#### MOLECULAR DIAGNOSTIC ANALYSIS SCRIPT ---------------

#This version of the script represents the final variant utilized for analysis,
#which was conducted between September 2022-March 2023 by Dr. Jenna Clark, a
#statistical consultant. Questions about the code can be directed to Dr. Clark
#at jenna.clark86@gmail.com.

#### LOADING SOFTWARE ------------------
#We begin with defining the R software packages needed to run the analysis.

#Graphing packages:
library(ggthemes)
library(ggfittext)
library(ggtext)
library(ggpubr)
library(psych)
library(ggrepel)
library(scales)
library(patchwork)
library(colorspace)

#Statistical packages:
library(emmeans)
library(interactions)
library(sandwich)
library(lmtest)
library(Hmisc)
#Packages for reading and writing data:
library(readr)
library(readxl)
library(clipr)

#Packages for data manipulation:
library(zoo)
library(stringi)
library(lubridate)
library(tidyverse)

#This line removes scientific notation from outputs.
options(scipen = 100)

####IMPORTING DATA----------------

#These datasets were provided by the patent vendor Harrity and then
#edited in Excel to sanitize variable names for analysis.

dfMDX<- read_csv("medDX3.csv")
dfTC1600 <- read_csv("TC16003.csv")

#All Excel-style non-applicables must be converted to NAs in R.
dfMDX[dfMDX=="#N/A"] <- NA
dfTC1600[dfTC1600=="#N/A"] <- NA

#We also include additional datasets intended to help us identify the
#assignee firms for different patents.
MDXAssignee<-read_csv("MDXAssignee.csv")
TC1600Assignee<-read_csv("TC1600Assignee.csv")
MDXNonUSInventor<-read_csv("MDXNonUSInventor.csv")
TC1600NonUSInventor<-read_csv("TC1600NonUSInventor.csv")
PlantPatents<-read_csv("PlantPatents.csv")

####FIXING DATA ISSUES------------

#In this round of data analysis, we became aware of multiple problems with the
#data that had to be corrected prior to beginning analysis.

##### Simplified Likely Setting - 9-15-2022 RETRY -----------

#We received this new dataset to help fix issues with the "simplified likely setting"
#column.

MDXApp <- read_csv("MDX Applicant.csv")
MDXAppID <- MDXApp$ApplicationNumber

#Now we can check - all patents not in this new dataset should be
#individual?
dfMDX <- dfMDX %>%
  mutate(Individ=if_else(NormalizedApplicationNumber %in% MDXAppID,"N","Y"))

#No, we must still have errors.
#There is no way 92k of these patents are individual.
table(dfMDX$Individ)

#So we'll take every appID from either this new Assignee data OR the old data.
MDXID2 <- c(MDXAppID)

#And check if patents are in that larger combined list.
dfMDX <- dfMDX %>%
  mutate(Individ2=if_else(NormalizedApplicationNumber %in% MDXID2,"N","Y"))

table(dfMDX$Individ2)

#OK, 18k seems like a plausible number of patents that should be individual.

#YES - at a quick eyeball, this looks good. These Entities typically read as people's names.
MDXIndivid <- dfMDX %>%
  filter(Individ2=="Y") %>%
  select(Entity,SimplifiedLikelySetting,Individ)

#And these all look non-individual.
MDXNonIndivid <- dfMDX %>%
  filter(Individ2=="N") %>%
  select(Entity,SimplifiedLikelySetting,Individ)

#This means we can now recode Setting as 4_likely_individual, where Individ2==Y.

dfMDX <- dfMDX %>%
  mutate(SimplifiedLikelySetting2=case_when(
    Individ2=="N" ~ SimplifiedLikelySetting,
    Individ2=="Y" ~ "4_likely_individual"
  ))

table(dfMDX$SimplifiedLikelySetting2)

#NOw we do the same for the TC1600 comparator data.

TC1600App <- read_csv("TC1600 Applicant.csv")
TC1600AppID <- TC1600App$ApplicationNumber
TC1600AssigneeID<-TC1600Assignee$ApplicationNumber

TC1600ID2 <- c(TC1600AppID,TC1600AssigneeID)

dfTC1600 <- dfTC1600 %>%
  mutate(Individ=if_else(ApplicationNumber %in% TC1600ID2,"N","Y"))

#41k seems plausible as a number of individual patents. Let's see.
table(dfTC1600$Individ)

#A surprising number of these patents have blank data for the Entity.
TC1600Individ <- dfTC1600 %>%
  filter(Individ=="Y") %>%
  select(Entity,SimplifiedLikelySetting,Individ)

#Checking out our nulls...
dfTC1600NULL <- dfTC1600 %>%
  filter(Entity=="NULL") %>%
  select(Entity,SimplifiedLikelySetting,ApplicationNumber,Individ)

#Picking 16137181 at random, it definitely has an Applicant.
#But, while Entity is NULL in both the .csv and the R dataframe,
#SimplifiedLikely looks right.

#Here we do see that there's considerable variability between the
#Nulls on LikelySetting, so that's a good sign that the Setting
#was calculated correctly even if we sadly lack the Entity data.
table(dfTC1600NULL$Individ,dfTC1600NULL$SimplifiedLikelySetting)

#So I'm going to assume the settings are correct.

dfTC1600 <- dfTC1600 %>%
  mutate(SimplifiedLikelySetting2=case_when(
    Individ=="N" ~ SimplifiedLikelySetting,
    Individ=="Y" ~ "4_likely_individual"
  ))

table(dfTC1600$SimplifiedLikelySetting2)

##### AllInventorsUS ---------------

#Sometimes this flag is listed as 1 even when all inventors are not from the US.
#Sample ID given where this is an issue is 16769579.
#AllInventorsUS is still tagged as 1 in this data for that patent.
#Looking at it in PEDS, the inventor is from Australia.

#Is this a problem with individual inventors only?
colnames(dfMDX)

#Table for country of first inventor:
table(dfMDX$CountryofFirstInventor)

#No; plenty of these patents aren't individual.
dfMDXNull <- dfMDX %>%
  filter(CountryofFirstInventor=="NULL") %>%
  select(NormalizedApplicationNumber,Entity,SimplifiedLikelySetting2,
         AllInventorsUS)

#Almost all of these Null CountryofFirstInventor ones are coded as All Inventors US.
#Is this why?
table(dfMDXNull$AllInventorsUS)


#How many apps are actually represented in the Inventor data?
MDXNUSInv <- read_csv("MDXNonUSInventor.csv")

MDXNUSInvIDs <- unique(MDXNUSInv$ApplicationNumber)

#InvUS is N if you have inventors in the dataset, which is
#non-US inventors only. Otherwise it can be Y.
#So we recalculate AllInventorsUS with a new variable.
dfMDX <- dfMDX %>%
  mutate(InvUS=if_else(NormalizedApplicationNumber %in% MDXNUSInvIDs,"N","Y"))

dfMDXNUS <- dfMDX %>%
  select(InvUS, NormalizedApplicationNumber,Entity,SimplifiedLikelySetting2,
         AllInventorsUS,CountryofFirstInventor)

#There is PRETTY good concurrence here, we just see 3500 patents where
#one metric is off. The vast majority of these seem wrongly coded as
#AllInventorsUS 1 when they are InvUS No.
table(dfMDXNUS$InvUS,dfMDXNUS$AllInventorsUS)

#What about the 200 patents that disagree in the other direction?
#These all appear to have diff countries of first inventor.
dfMDXDis <- dfMDXNUS %>%
  filter(InvUS=="Y" & AllInventorsUS==0)

#So, what we need is a new variable that demands both that the inventor isn't
#in the databse and that the Country is not equal to US.

dfMDX <- dfMDX %>%
  mutate(Domestic=case_when(
     InvUS=="Y" & CountryofFirstInventor=="US" ~"Y",
     InvUS=="Y" & CountryofFirstInventor=="NULL" ~ "Y",
     TRUE ~ "N"))
     
table(dfMDX$Domestic)

#We are left with 301 disagreements.
table(dfMDX$Domestic,dfMDX$InvUS)

#Eyeballing these disagreements, we see they do appear to be N correctly for
#Domestic, due to the CountryofFirstInventor.

dfMDXDis2 <- dfMDX %>%
  filter(Domestic=="N" & InvUS=="Y") %>%
  select(InvUS, NormalizedApplicationNumber,Entity,SimplifiedLikelySetting2,
         AllInventorsUS,CountryofFirstInventor,Domestic)

#Domestic should be correct, then.

#Let's do the same for TC1600.

#We grab inventors...
TCNUSInv <- read_csv("TC1600NonUSInventor.csv")
TCNUSInvIDs <- unique(TCNUSInv$ApplicationNumber)

#We make InvUS. We can't make Domestic because we don't have First Country.

dfTC1600 <- dfTC1600 %>%
  mutate(InvUS=if_else(ApplicationNumber %in% TCNUSInvIDs,"N","Y"))

#We see 4712 errors in one direction, 407 in the other, which isn't ideal.
#Probably the best way to handle this, then, is to assume the 4712 are
#miscoded, but the 407 are right. That matches the pattern we saw with
#MDX, where we DID have first inventor countries
table(dfTC1600$InvUS,dfTC1600$AllInventorsUS)

dfTC1600 <- dfTC1600 %>%
  mutate(Domestic=case_when(
    InvUS=="N" ~ "N",
    InvUS=="Y" & AllInventorsUS==0 ~"N",
    InvUS=="Y" & AllInventorsUS==1 ~ "Y"))

#We will spot-check a few.
dfTC1600Foreign <- dfTC1600 %>%
  filter(Domestic=="N")

#15617522 - Foreign
#13334798 - Foreign
#13332430 - Foreign
#15696889 - first inventor US but others non-US
#16630184 - Foreign.

#Looks good.

##### GrantWords ------------

#GrantClaim1TotalWords: sometimes lists #N/A when the patent is granted (see, 
#e.g. app 14545676.). 

#Digging in that's usually because it's a plant patent. (Though this is not always the problem, 
#14946595 is a chemical patent that seems to have the same problem)

#Can you remove / mark all the plant patents so we can reduce this problem? And/or fix 
#the issue so that all granted patents also include a Claim1 Word Count?

#Let's begin by identifying patents with NA words.

dfMDXGW <- dfMDX %>%
  filter(!is.na(GrantDate) & is.na(GrantClaim1TotalWords))

#We get 3804 patents.

#Which ones are plant?

PlantPatents <- read_csv("PlantPatents.csv")
PPApps <-PlantPatents$ApplicationNumber

dfMDXGW <- dfMDXGW %>%
  mutate(Plant=if_else(NormalizedApplicationNumber %in% PPApps, "Y","N"))

dfTCGW <- dfTC1600 %>%
  mutate(Plant=if_else(ApplicationNumber %in% PPApps, "Y","N"))

#OK, MOST of these are plant patents but not all.
table(dfMDXGW$Plant)

#Honestly, the best fix here is probably to omit all patents with NA GrantWords
#when dealing with granted patents.

#That's so few patents I'm fine with that approach.

#### COMPARING DATASETS -----------------------------------

#Before we begin substantive analysis, we will quickly re-run the
#consistency checks we did on prior datasets.

#We create filtered datasets of all MedDX
df2MDOnly <- dfMDX %>%
  filter(MedDX==1 & TC1600==1 & ArtUnit != "1631" & ArtUnit != "1639")

#We fix missing data and variable typing.
dfMDX <- dfMDX %>% 
  mutate_all(.,~na_if(.x, "NULL")) %>%
  mutate_all(.,~na_if(.x, "#N/A")) 

dfMDX <- dfMDX %>%
  mutate(across(contains("Date"), ~mdy(.x)))

#Variable typing here.
dfTC1600 <- dfTC1600 %>%
  mutate(across(contains("Date"), ~mdy(.x)))

dfTC1600$FilingYear <- year(dfTC1600$FilingDate)

#We create a sunset that's matched on MD only.
dfTCMDOnly <- dfTC1600 %>%
  filter(MedDxTag==1 & GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639")

#Year by year comparison of counts.
write_clip(df2MDOnly %>%
             group_by(FileYear) %>%
             count() %>%
             ungroup)

write_clip(dfTCMDOnly %>%
             group_by(FilingYear) %>%
             count() %>%
             ungroup)

#Store the app numbers in both datasets
TCApps <- dfTCMDOnly$ApplicationNumber
MedDXApps <- df2MDOnly$NormalizedApplicationNumber

#Now, theoretically, everything that's in MDX should be iN TC1600
#is that true?
#We see 101 mismatched
df2MDextra <- df2MDOnly %>%
  filter(!NormalizedApplicationNumber %in% TCApps)

#and everything that's in MDX-only TC should be in MDX. Is that true?
#We see 957 - same as before.
dfTCMDextra <- dfTCMDOnly %>%
  filter(!ApplicationNumber %in% MedDXApps)

#Are they just disagreeing on MedDX labels?
#I.e., would we see the patents in dfTCMDextra in
#MedDX?

extraTCs <- dfTCMDextra$ApplicationNumber
extraMDs <- df2MDextra$NormalizedApplicationNumber

#Yes! They ARE all here.
dfMDXall <- dfMDX %>%
  filter(NormalizedApplicationNumber %in% extraTCs)

#They're all MedDX...
table(dfMDXall$MedDX)
#...but they're NOT tc1600!
table(dfMDXall$TechCenter)
table(dfMDXall$TC1600)

#This is a known issue that can happen when patents are
#sometimes changed in the analysis process, so we're content
#dealing with those slight mismatches.

##### Checking all possible mismatches ------------

#are there other things about the patents that are mismatched between the two
#data sets?

dfMDXa <- dfMDX %>%
  rename(ApplicationNumber=NormalizedApplicationNumber)

Check <- merge(dfMDXa, dfTC1600,by="ApplicationNumber")

colnames(Check)

#Now we check for mismatches....

#We rename to make it clear which ones are in both.
colnames(Check) = gsub("\\.x", "MDX", colnames(Check))
colnames(Check) = gsub("\\.y", "TC1600", colnames(Check))

colnames(Check)

table(dfTC1600$ConType)
table(dfMDX$CONType)

#Now we want to rename across variables that're in both datasets but have
#slightly different specifications.

#Then we can generate a Mismatch variable for each.

Check2 <- Check %>%
  rename(FileDateMDX=FileDate,
         FileDateTC1600=FilingDate,
         ArtUnitMDX=ArtUnit,
         ArtUnitTC1600=GroupArtUnitNumber,
         MedDxTagMDX=MedDX,
         MedDxTagTC1600=MedDxTag,
         ConTypeMDX=CONType,
         ConTypeTC1600=ConType) %>%
  mutate(ApplicationStatusDateMDX=as.Date(ApplicationStatusDateMDX),
         ApplicationStatusDateTC1600=as.Date(ApplicationStatusDateTC1600)) %>%
  select(ApplicationNumber,contains(c("MDX","TC1600"))) %>%
  mutate(PublicationNumberMismatch=if_else(PublicationNumberMDX==PublicationNumberTC1600,"N","Y"),
         FileDateMismatch=if_else(FileDateMDX==FileDateTC1600,"N","Y"),
         PublicationDateMismatch=if_else(PublicationDateMDX==PublicationDateTC1600,"N","Y"),
         GrantDateMismatch=if_else(GrantDateMDX==GrantDateTC1600,"N","Y"),
         DomesticMismatch=if_else(DomesticMDX==DomesticTC1600,"N","Y"),
         ArtUnitMismatch=if_else(ArtUnitMDX==ArtUnitTC1600,"N","Y"),
         ApplicationStatusDateMismatch=if_else(ApplicationStatusDateMDX==ApplicationStatusDateTC1600,"N","Y"),
         ConTypeMismatch=if_else(ConTypeMDX==ConTypeTC1600,"N","Y"),
         MedDxTagMismatch=if_else(MedDxTagMDX==MedDxTagTC1600,"N","Y"),
         SLSMismatch=if_else(SimplifiedLikelySetting2MDX==SimplifiedLikelySetting2TC1600,"N","Y"),
         ApplicationStatusDiff=ApplicationStatusDateTC1600-ApplicationStatusDateMDX)

#We produce a table of all mismatches
#Everything except AppStatusDate looks great. And we don't use AppStatus
#for much, so this is reassuring.
CheckTables <- Check2 %>%
  select(contains("Mismatch"))

#### CREATING AND REFORMATTING VARIABLES ------------------

#Are there duplicates in either dataset?

#2 here - not concerned
dupIdMDX<-dfMDX[duplicated(dfMDX$NormalizedApplicationNumber),]

#only 1 here - hooray!
dupIDTC<-dfTC1600[duplicated(dfTC1600$ApplicationNumber),]

#Our older code uses df1 for dfMDX, so we will do the same to avoid
#a tremendous amount of changes.

df1 <- dfMDX

##### Recoding setting/entity -------------------

#We recode Setting for readability.
df1 <- df1 %>%
  mutate(SLS2=case_when(
    SimplifiedLikelySetting == "01_likely_nonprofit_or_govt" ~ "Non-Profit or Govt.",
    SimplifiedLikelySetting == "2_likely_large_co" ~ "Large Company",
    SimplifiedLikelySetting == "3_likely_small_co" ~ "Small Company",
    SimplifiedLikelySetting == "45_unassigned_individual_or_micro" ~ "Individual Inventor"))

table(df1$SLS2)

#We also want Setting split into small and large.

df1 <- df1 %>%
  mutate(SLS2Bin=if_else(SLS2=="Large Company","Large","Small"))

##### Recoding application status variables --------------

#If an app has a Grant date, it is granted.
df1 <- df1 %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0))

#If an app has a publication date, it is published
df1 <- df1 %>%
  mutate(Published = ifelse(!is.na(PublicationDate),1,0))

#If an app has a Pendency to Abandon, it is abandoned.
df1 <- df1 %>%
  mutate(Abandoned = ifelse(!is.na(PendencytoAbandon),1,0))

##### Recoding dates and doing date math -----------------------

#All dates should be... dates.)
df1 <- df1 %>%
  mutate(across(contains("Date"), ~as.Date(.x,format="%m/%d/%Y")))

#FOr this to work, R has to know the Words variables are numeric.
df1 <- df1 %>%
  mutate(across(contains("Words"), ~as.numeric(.x)))

#Pendency to Publication must be generated.
df1$PendencytoPub <- df1$PublicationDate-df1$FileDate

#We will calculate PendencytoGrant from scratch to make it the right class.
df1$PendencytoGrant <- df1$GrantDate-df1$FileDate

#Continuation vs. new
df2 <- df1 %>%
  mutate(Continuation = if_else(grepl("Continuation",CONType),"Continuation","New Patent" ))

##### Reformatting variables as numeric --------

#Rejections as numeric.
df2 <- df2 %>%
  mutate(across(contains("Rejection"), ~as.numeric(.x)))

#So we can do math on them...
df2 <- df2 %>%
  mutate(Any101Rej = case_when(
    Total101Rejections == 0 ~ 0,
    Total101Rejections > 0 ~ 1,
    TRUE ~ NA_real_
  ))

df2 <- df2 %>%
  mutate(SumRejections = Total101Rejections+Total102Rejections+
           Total103Rejections+Total112Rejections) %>%
  mutate(Perc101Rej = case_when(
    is.na(SumRejections) ~ NA_real_,
    SumRejections ==0 ~ NA_real_,
    SumRejections > 0 ~ Total101Rejections/SumRejections))

#Word fields must all also be numeric.
df2 <- df2 %>%
  mutate(across(contains("Words"), ~as.numeric(.x)))

##### Creating treatment variables -----------------------

#We drop the incorrect AUs...
df2 <- df2 %>%
  filter(ArtUnit != "1631" & ArtUnit != "1639")

#First, we create Treatment variables for later analysis
df2 <- df2 %>%
  mutate(TreatmentMDX = case_when(
    TC1600 == 1 & MedDX == 0 ~ "0",
    MedDX == 1 ~ "1"))

df2$TreatmentMDX<-as.factor(df2$TreatmentMDX)

##### Creating time variables -------------------

#Now we need to create Quarter dates and Continuation status.

df2 <- df2 %>%
  mutate(across(contains("Pendency"), ~as.numeric(.x)))

df2 <- df2 %>%
  mutate(AbandonDate = FileDate+PendencytoAbandon)

df2 <- df2 %>%
  mutate(ResolutionDate=if_else(Granted==1,GrantDate,AbandonDate))

df2$PendencytoResolution <-df2$ResolutionDate-df2$FileDate

#We will also need years of everything.
#We have FileYear, GrantYear, and PriorityYear. 

#We need FirstOAYear and ResolutionYear.

df2 <- df2 %>%
  mutate(FirstOAYear=year(FirstOADate),
         ResolutionYear=year(ResolutionDate))

#We will also want months for the sake of tabular processing.

df2 <- df2 %>%
  mutate(FirstOAMonth=as.yearmon(FirstOADate),
         ResolutionMonth=as.yearmon(ResolutionDate),
         GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate))

#ANd some Data Limits.

StartDate <- mdy("12-31-2009")
EndDate <- mdy("01-01-2020")

##### Creating Mayo status -------------------------

#We are re-ordering/renaming hypotheses here to match the paper
#and not the original table.

#Hypothesis 1/2: We need FileDate and GrantDate by Mayo,
#and we need PubDate+15 months by Mayo.

MayoDate <- mdy("3-20-2012")
Mayo2Years <- mdy("3-20-2014")
PubMayo <- mdy("6-20-2013")
PubMayo2Years <- mdy("6-20-2015")

df2 <- df2 %>%
  mutate(MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

df2 <- df2 %>%
  mutate(MayoFileIm = case_when(
    FileDate > MayoDate & FileDate < Mayo2Years ~ "Yes",
    TRUE  ~ "No")) 

df2 <- df2 %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

df2 <- df2 %>%
  mutate(MayoGrantIm = case_when(
    GrantDate > MayoDate & GrantDate < Mayo2Years ~ "Yes",
    TRUE  ~ "No")) 

df2 <- df2 %>%
  mutate(MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_))  

df2 <- df2 %>%
  mutate(MayoPubIm = case_when(
    PublicationDate > PubMayo & PublicationDate < PubMayo2Years ~ "Yes",
    TRUE  ~ "No")) 

#Hypothesis 3: 
#We need to know if Grant Rates decline, utilizing Resolution Date.

df2 <- df2 %>%
  mutate(MayoRes = case_when(
    ResolutionDate < MayoDate ~ "Before",
    ResolutionDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

df2 <- df2 %>%
  mutate(MayoResIm = case_when(
    ResolutionDate > MayoDate & ResolutionDate < Mayo2Years ~ "Yes",
    TRUE  ~ "No")) 

#Hypothesis 4:
#We need to know if claim1 length got longer, utilizing Grant Date.

#Grant date has already been calculated.

#Hypothesis 5:
#Time to resolution by Resolution Date; already calculated.

#Hypothesis 5: Delta between claims got longer; GrantDate already
#calculated.

#Hypothesis 6: More detail/words, overnight as of FileDate. Filedate's
#already been calculated.

#Now we want all Mayotimes ordered right...

df2 <- df2 %>%
  mutate(across(c(MayoPub,MayoRes,MayoFile,MayoGrant), 
                ~factor(.x, levels=c("Before","After"))))

#### DESCRIPTIVES ------------------

#####How long does it take patents to be granted?-----------

colnames(dfTC1600)

dfTC1600 <- dfTC1600 %>%
  mutate(across(contains("Date"), ~as.Date(.x,format="%Y/%m/%d"))) %>%
  mutate(PendencytoGrant=GrantDate-FilingDate,
         FilingYear=year(FilingDate)) %>%
  mutate(PendencyYear=round(PendencytoGrant/365,0))

table(dfTC1600$PendencyYear)

PendencyByYear <- dfTC1600 %>%
  filter(!is.na(GrantDate)) %>%
  select(FilingYear,PendencyYear) %>%
  group_by(FilingYear) %>%
  summarize(meanPendencyYears=as.numeric(mean(PendencyYear, na.rm=TRUE))) %>%
  ungroup

#### HYPOTHESIS 1: PATENTING ACTIVITY ---------------

##### H1 Data Creation------------------

#This is tricky because we need counts for apps, grants, and pubs,
#and we need them combining the two datasets. This is the only time this
#is the case, because this hypothesis is the only one relying on absolute
#total counts, as opposed to looking at subsections of patents.

#So we take only the MedDX status and the most relevant variables from MDX...
medDXOnly <- df2 %>%
  filter(TreatmentMDX==1) %>%
  select(TreatmentMDX,NormalizedApplicationNumber,PublicationNumber,Granted,FileDate,PublicationDate,
         GrantDate) 

#And then ditto for the TC1600 sample.

TC1600only <- dfTC1600 %>%
  filter(GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639" & MedDxTag==0) %>%
  select(MedDxTag,ApplicationNumber,PublicationNumber, FilingDate,PublicationDate,GrantDate) %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0)) %>%
  rename(NormalizedApplicationNumber=ApplicationNumber,FileDate=FilingDate,TreatmentMDX=MedDxTag)

H1Data <- rbind(medDXOnly,TC1600only)

#Did any duplicates end up in here?
#We see one duplicated patent, perhaps.
dupIdH1<-H1Data[duplicated(H1Data$NormalizedApplicationNumber),]

#PEDS suggests the 8-10 file date is more correct.
#We manually remove the older duplicate by publication number.
H1Data <- subset(H1Data,PublicationNumber != "US20220023166A1" | is.na(PublicationNumber))

#Now we need Mayos for THESE

H1Data <- H1Data %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_),
    MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_),
    MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

H1Data <- H1Data %>%
  mutate(across(c(MayoPub,MayoFile,MayoGrant,), 
                ~factor(.x, levels=c("Before","After"))))

H1Data <- H1Data %>%
  mutate(GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate),
         GrantYear=year(GrantDate),
         FileYear=year(FileDate),
         PubYear=year(PublicationDate)
  )

#### HYPOTHESIS 1: PATENTING ACTIVITY - TABLES --------------

#I am keeping in limiters for start/end dates because otherwise
#the data is very messed up by the uneven info.

AppsMDX <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, NormalizedApplicationNumber,TreatmentMDX,FileMonth) %>%
  group_by(MayoFile,TreatmentMDX,FileMonth) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsMDX <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentMDX,PubMonth) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentMDX,PubMonth) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsMDX <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentMDX,GrantMonth) %>%
  group_by(MayoGrant,TreatmentMDX,GrantMonth) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

#Mayo versions

AppsMDXMayo <- AppsMDX %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(AppNum=mean(AppNum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoFile, values_from=AppNum) %>%
  ungroup()

write_clip(as.matrix(unname(AppsMDXMayo[1:2,2:3])))

PubsMDXMayo <- PubsMDX %>%
  group_by(TreatmentMDX,MayoPub) %>%
  summarize(PubSum=mean(PubSum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoPub, values_from=PubSum) %>%
  ungroup()

write_clip(as.matrix(unname(PubsMDXMayo[1:2,2:3])))

GrantsMDXMayo <- GrantsMDX %>%
  group_by(TreatmentMDX,MayoGrant) %>%
  summarize(GrantSum=mean(GrantSum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoGrant, values_from=GrantSum) %>%
  ungroup()

write_clip(as.matrix(unname(GrantsMDXMayo[1:2,2:3])))

#Yearly versions

AppsMDXY <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile)) %>%
  select(NormalizedApplicationNumber,TreatmentMDX,FileYear) %>%
  group_by(TreatmentMDX,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup()

write_clip(as.matrix(unname(AppsMDXY)))

PubsMDXY <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoPub)) %>%
  select(PublicationNumber,TreatmentMDX,PubYear) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(TreatmentMDX,PubYear) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

write_clip(as.matrix(unname(PubsMDXY)))

GrantsMDXY <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant)) %>%
  select(Granted,TreatmentMDX,GrantYear) %>%
  group_by(TreatmentMDX,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

write_clip(as.matrix(unname(GrantsMDXY)))

#### HYPOTHESIS 1: PATENTING ACTIVITY - ANALYSIS ----------------

#Parallel trends

AppsDID <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, NormalizedApplicationNumber,TreatmentMDX,FileDate) %>%
  group_by(MayoFile,TreatmentMDX,FileDate) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsDID <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentMDX,PublicationDate) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentMDX,PublicationDate) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsDID <- H1Data %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentMDX,GrantDate) %>%
  group_by(MayoGrant,TreatmentMDX,GrantDate) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

AppsPT <- AppsDID %>%
  filter(MayoFile=="Before")

PubsPT <- PubsDID %>%
  filter(MayoPub=="Before")

GrantsPT <- GrantsDID %>%
  filter(MayoGrant=="Before")

#Apps are parallel
modelAppsPT <- lm(AppNum ~ FileDate*TreatmentMDX, 
                  data=AppsPT)
summary(modelAppsPT)
coeftest(modelAppsPT, vcov. = vcovHC(modelAppsPT, type='HC3'))

#Pubs are NOT parallel
modelPubsPT <- lm(PubSum ~ PublicationDate*TreatmentMDX, 
                  data=PubsPT)
summary(modelPubsPT)
coeftest(modelPubsPT, vcov. = vcovHC(modelPubsPT, type='HC3'))

#GRants are NOT parallel
modelGrantsPT <- lm(GrantSum ~ GrantDate*TreatmentMDX, 
                    data=GrantsPT)
summary(modelGrantsPT)
coeftest(modelGrantsPT, vcov. = vcovHC(modelGrantsPT, type='HC3'))

#DiD ANalysis for Apps
#No effect
modelAppsDID<- lm(AppNum ~ MayoFile*TreatmentMDX + as.Date(FileDate), 
                  data=AppsDID)
summary(modelAppsDID)
coeftest(modelAppsDID, vcov. = vcovHC(modelAppsDID, type='HC3'))

modelAppsDIDEM <- emmeans(modelAppsDID, ~MayoFile*TreatmentMDX,
                          vcov = sandwich::vcovHC(modelAppsDID,type='HC3'))

em<-as.data.frame(modelAppsDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoFile,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

#### HYPOTHESIS 1A: PATENTING ACTIVITY - TABULAR -----------------

#We must first duplicate the above steps but do it only for the small
#US based ones

#And we also need a variant of df2.

#NOTE - We use the new variable "Domestic" instead of AllInventorsUS,
#as figured earlier in this code. Ditto for SimplifiedLIkelySetting2 
#replacing Entity.

medDXOnly2 <- df2 %>%
  filter(TreatmentMDX==1 & Domestic=="Y" & SimplifiedLikelySetting2=="3_likely_small_co") %>%
  select(TreatmentMDX,NormalizedApplicationNumber,PublicationNumber,Granted,FileDate,PublicationDate,
         GrantDate) 

#ANd then ditto for the TC1600 sample

TC1600only2 <- dfTC1600 %>%
  filter(GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639" 
         & Domestic=="Y" & SimplifiedLikelySetting2=="3_likely_small_co" & MedDxTag == 0) %>%
  select(MedDxTag, ApplicationNumber,PublicationNumber,FilingDate,PublicationDate,GrantDate) %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0)) %>%
  rename(NormalizedApplicationNumber=ApplicationNumber,FileDate=FilingDate,TreatmentMDX=MedDxTag)

H1Data2 <- rbind(medDXOnly2,TC1600only2)

#Now we need to add date and Mayo data.

H1Data2 <- H1Data2 %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

PubMayo <- mdy("6-20-2013")
PubMayo2Years <- mdy("6-20-2015")

H1Data2 <- H1Data2 %>%
  mutate(MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_))  

H1Data2 <- H1Data2 %>%
  mutate(MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

H1Data2 <- H1Data2 %>%
  mutate(across(c(MayoPub,MayoFile,MayoGrant,), 
                ~factor(.x, levels=c("Before","After"))))

H1Data2 <- H1Data2 %>%
  mutate(GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate),
         GrantYear=year(GrantDate),
         FileYear=year(FileDate),
         PubYear=year(PublicationDate)
  )

#Now for the real tables

AppsMDX2 <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, NormalizedApplicationNumber,TreatmentMDX,FileMonth) %>%
  group_by(MayoFile,TreatmentMDX,FileMonth) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsMDX2 <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentMDX,PubMonth) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentMDX,PubMonth) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsMDX2 <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentMDX,GrantMonth) %>%
  group_by(MayoGrant,TreatmentMDX,GrantMonth) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

#Mayo versions

AppsMDXMayo2 <- AppsMDX2 %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(AppNum=mean(AppNum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoFile, values_from=AppNum) %>%
  ungroup()

write_clip(as.matrix(unname(AppsMDXMayo2[1:2,2:3])))

PubsMDXMayo2 <- PubsMDX2 %>%
  group_by(TreatmentMDX,MayoPub) %>%
  summarize(PubSum=mean(PubSum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoPub, values_from=PubSum) %>%
  ungroup()

write_clip(as.matrix(unname(PubsMDXMayo2[1:2,2:3])))

GrantsMDXMayo2 <- GrantsMDX2 %>%
  group_by(TreatmentMDX,MayoGrant) %>%
  summarize(GrantSum=mean(GrantSum, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoGrant, values_from=GrantSum) %>%
  ungroup()

write_clip(as.matrix(unname(GrantsMDXMayo2[1:2,2:3])))


#### HYPOTHESIS 1A: PATENTING ACTIVITY - ANALYSIS ----------------

#Same as 1 but among small US-based only

#Parallel trends

AppsDID <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, NormalizedApplicationNumber,TreatmentMDX,FileDate) %>%
  group_by(MayoFile,TreatmentMDX,FileDate) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsDID <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentMDX,PublicationDate) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentMDX,PublicationDate) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsDID <- H1Data2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentMDX,GrantDate) %>%
  group_by(MayoGrant,TreatmentMDX,GrantDate) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

AppsPT <- AppsDID %>%
  filter(MayoFile=="Before")

PubsPT <- PubsDID %>%
  filter(MayoPub=="Before")

GrantsPT <- GrantsDID %>%
  filter(MayoGrant=="Before")

#Apps are parallel
modelAppsPT <- lm(AppNum ~ FileDate*TreatmentMDX, 
                  data=AppsPT)
summary(modelAppsPT)
coeftest(modelAppsPT, vcov. = vcovHC(modelAppsPT, type='HC3'))

#Pubs are parallel
modelPubsPT <- lm(PubSum ~ PublicationDate*TreatmentMDX, 
                  data=PubsPT)
summary(modelPubsPT)
coeftest(modelPubsPT, vcov. = vcovHC(modelPubsPT, type='HC3'))

#GRants are NOT parallel
modelGrantsPT <- lm(GrantSum ~ GrantDate*TreatmentMDX, 
                    data=GrantsPT)
summary(modelGrantsPT)
coeftest(modelGrantsPT, vcov. = vcovHC(modelGrantsPT, type='HC3'))

#DiD ANalysis for Apps
#Significant!
modelAppsDID<- lm(AppNum ~ MayoFile*TreatmentMDX + as.Date(FileDate), 
                  data=AppsDID)
summary(modelAppsDID)

#Testing for HSCD, we find it
plot(modelAppsDID)
bptest(modelAppsDID)

#So we use robust standard errors.

coeftest(modelAppsDID, vcov. = vcovHC(modelAppsDID, type='HC3'))

#Graph to probe
cat_plot(modelAppsDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Number of Applications",
       x="Mayo Timing of File Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
theme(text=element_text(size=14))

modelAppsDIDEM <- emmeans(modelAppsDID, ~MayoFile*TreatmentMDX,
                          vcov = sandwich::vcovHC(modelAppsDID,type='HC3'))

em<-as.data.frame(modelAppsDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoFile,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
       CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

#First we obtain the RMSE
RSS <- c(crossprod(modelAppsDID$residuals))
MSE <- RSS / length(modelAppsDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
AppsData2 <- AppsDID %>%
  filter(!is.na(MayoFile)) %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(mean=mean(AppNum, na.rm=TRUE))

A1<-AppsData2$mean[1]
A2<-AppsData2$mean[2]
B1<-AppsData2$mean[3]
B2<-AppsData2$mean[4]

#Then we can calculate Cohen's d
#Effect size of -0.25
d=((A1-B1) - (A2-B2))/(2*RMSE)

#DiD for Pubs

#Significant!
modelPubsDID<- lm(PubSum ~ MayoPub*TreatmentMDX + as.Date(PublicationDate), 
                  data=PubsDID)
summary(modelPubsDID)
coeftest(modelPubsDID, vcov. = vcovHC(modelPubsDID, type='HC3'))

#Graph to probe
cat_plot(modelPubsDID, pred = MayoPub, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Number of Publications",
       x="Mayo Timing of Publication Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))

modelPubsDIDEM <- emmeans(modelPubsDID, ~MayoPub*TreatmentMDX,
                          vcov = sandwich::vcovHC(modelPubsDID,type='HC3'))

em<-as.data.frame(modelPubsDIDEM)
em2<-em %>%
  select(MayoPub,TreatmentMDX,emmean) %>%
  pivot_wider(names_from=MayoPub,values_from=emmean) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

#First we obtain the RMSE
RSS <- c(crossprod(modelPubsDID$residuals))
MSE <- RSS / length(modelPubsDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
PubsData2 <- PubsDID %>%
  filter(!is.na(MayoPub)) %>%
  group_by(TreatmentMDX,MayoPub) %>%
  summarize(mean=mean(PubSum, na.rm=TRUE))

A1<-PubsData2$mean[1]
A2<-PubsData2$mean[2]
B1<-PubsData2$mean[3]
B2<-PubsData2$mean[4]

#Then we can calculate Cohen's d
#Effect size of --0.75
d=((A1-B1) - (A2-B2))/(2*RMSE)

#### HYPOTHESIS 2: SHARE OF NEW APPS - TABULAR ----------------

AppStatus <- df2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileMonth,MayoFile,MayoFileIm) %>%
  group_by(Continuation,TreatmentMDX,FileMonth,MayoFile,MayoFileIm) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total)

AppStatusMayo <- AppStatus %>%
  select(PercNew,TreatmentMDX,MayoFile) %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(PercNew=mean(PercNew, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoFile, values_from=PercNew) %>%
  ungroup()

write_clip(as.matrix(unname(AppStatusMayo[1:2,2:3])))

#### HYPOTHESIS 2: SHARE OF NEW APPS - ANALYSIS ----------------

AppStatusData <- df2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileDate,MayoFile) %>%
  group_by(Continuation,TreatmentMDX,FileDate,MayoFile) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total)

#Now a PT dataset
AppStatusPT <- AppStatusData %>%
  filter(MayoFile=="Before")

#Share of new is not parallel.
modelAppStatusPT <- lm(PercNew ~ FileDate*TreatmentMDX, 
                       data=AppStatusPT)
summary(modelAppStatusPT)
coeftest(modelAppStatusPT, vcov. = vcovHC(modelAppStatusPT, type='HC3'))

#### HYPOTHESIS 2a: SHARE OF NEW APPS - TABULAR ----------------

#Only difference is using df3.

#Here, we do not need H1Data because we don't need the absolute raw
#number. We can calculate a percentage just fine from the subset of
#data in mdx.

df3 <- df2 %>%
  filter(Domestic=="Y" & SimplifiedLikelySetting2=="3_likely_small_co")

AppStatus2 <- df3 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileMonth,MayoFile,MayoFileIm) %>%
  group_by(Continuation,TreatmentMDX,FileMonth,MayoFile,MayoFileIm) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total)

AppStatusMayo2 <- AppStatus2 %>%
  select(PercNew,TreatmentMDX,MayoFile) %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(PercNew=mean(PercNew, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoFile, values_from=PercNew) %>%
  ungroup()

write_clip(as.matrix(unname(AppStatusMayo2[1:2,2:3])))

#### HYPOTHESIS 2a: SHARE OF NEW APPS - ANALYSIS ----------------

#Repeated for small US-only 
#Only change is using df3.

AppStatusData <- df3 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileDate,MayoFile) %>%
  group_by(Continuation,TreatmentMDX,FileDate,MayoFile) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total)

#Now a PT dataset
AppStatusPT <- AppStatusData %>%
  filter(MayoFile=="Before")

#Share of new is parallel.
modelAppStatusPT <- lm(PercNew ~ FileDate*TreatmentMDX, 
                       data=AppStatusPT)
summary(modelAppStatusPT)
coeftest(modelAppStatusPT, vcov. = vcovHC(modelAppStatusPT, type='HC3'))

#DID?
#Not significant.
modelAppStatusDID<- lm(PercNew~ MayoFile*TreatmentMDX + as.Date(FileDate), 
                       data=AppStatusData)
summary(modelAppStatusDID)
coeftest(modelAppStatusDID, vcov. = vcovHC(modelAppStatusDID, type='HC3'))

#### HYPOTHESIS 2b: TOTAL NUMBER OF NEW APPLICATIONS ----------------

#As suggested by a pre-publication reviewer, we add an alternative specification
#that assesses new vs. continuing applications not as a percentage, but as an
#absolute number over time.

#Share of new is not parallel.
modelAppStatusPT2 <- lm(`New Patent` ~ FileDate*TreatmentMDX, 
                       data=AppStatusPT)
summary(modelAppStatusPT2)
bptest(modelAppStatusPT2)
coeftest(modelAppStatusPT2, vcov. = vcovHC(modelAppStatusPT2, type='HC3'))


#### HYPOTHESIS 3: GRANT RATES - TABLES --------------

#Grant Rates decline instantly as of resolution date.

#Tabular Generation
GrantRateData <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(Granted,TreatmentMDX,ResolutionMonth,MayoRes,MayoResIm) %>%
  group_by(TreatmentMDX,ResolutionMonth,MayoRes,MayoResIm) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  ungroup()

#Before/After Mayo
GrantRateDataMayo <- GrantRateData %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoRes)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoRes, values_from=mean) %>%
  ungroup()

write_clip(as.matrix(unname(GrantRateDataMayo[1:2,2:3])))

#### HYPOTHESIS 3: GRANT RATES - ANALYSIS --------------

GrantRateData2 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(Granted,TreatmentMDX,ResolutionDate,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
GrantRatePT <- GrantRateData2 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelGrantRatePT<- lm(mean ~ ResolutionDate*TreatmentMDX, 
                      data=GrantRatePT)
summary(modelGrantRatePT)
coeftest(modelGrantRatePT, vcov. = vcovHC(modelGrantRatePT, type='HC3'))

#Main Analysis
#We do see a DID effect
modelGrantRate<- lm(mean ~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                    data=GrantRateData2)
summary(modelGrantRate)

bptest(modelGrantRate)
coeftest(modelGrantRate, vcov. = vcovHC(modelGrantRate, type='HC3'))

#This code extracts marginal means for a table.

modelGrantRateEM <- emmeans(modelGrantRate, ~MayoRes*TreatmentMDX,
                            vcov = sandwich::vcovHC(modelGrantRate,type='HC3'))

em<-as.data.frame(modelGrantRateEM)
em2<-em %>%
  select(MayoRes,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoRes,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,4)," (",round(SE_Before,4),")"),
         CombA=paste0(round(emmean_After,4)," (",round(SE_After,4),")"))

write_clip(unname(em2[1:2,6:7]))

#Interaction plot

cat_plot(modelGrantRate, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Grant Rate (Daily)",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))

#Now we need to get an effect size measure
#We will use Jake Westfall's formula as lsited at:
# http://jakewestfall.org/blog/index.php/2015/05/27/follow-up-what-about-uris-2n-rule/#comment-24

#First we obtain the RMSE
RSS <- c(crossprod(modelGrantRate$residuals))
MSE <- RSS / length(modelGrantRate$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData4 <- GrantRateData2 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData4$mean[1]
A2<-GrantRateData4$mean[2]
B1<-GrantRateData4$mean[3]
B2<-GrantRateData4$mean[4]

#Then we can calculate Cohen's d
#small -0.09. Why such a change? Must double-check.
d=((A1-B1) - (A2-B2))/(2*RMSE)

#We're going to try it with a Cohen's d calculator for just
#the first group, which requires we get SD and N.

GRC <- GrantRateData2 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean2=mean(mean, na.rm=TRUE),
            SD=sd(mean,na.rm=TRUE),
            n=n()) %>%
  ungroup

#Using https://www.socscistatistics.com/effectsize/default3.aspx
#we see that the Cohen's D for A1/B1 is only 0.17, because of the
#massive standard deviations. So, that makes sense of it.

#### HYPOTHESIS 3: GRANT RATES - ALT ANALYSIS - MONTHS -------------- 

#What about as months?

GrantRateData3 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(Granted,TreatmentMDX,ResolutionMonth,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionMonth,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  ungroup()

GrantRatePT2 <- GrantRateData3 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Huh, now it's not parallel. Weird. THough it's close.
modelGrantRatePT2<- lm(mean ~ ResolutionMonth*TreatmentMDX, 
                      data=GrantRatePT2)
summary(modelGrantRatePT2)
coeftest(modelGrantRatePT2, vcov. = vcovHC(modelGrantRatePT2))
                          
#IF you run it anyway, it's significant.                
modelGrantRate2<- lm(mean ~ MayoRes*TreatmentMDX + ResolutionMonth, 
                    data=GrantRateData3)
summary(modelGrantRate2)
#Except... only borderline now.
coeftest(modelGrantRate2, vcov. = vcovHC(modelGrantRate2, type='HC3'))

RSS <- c(crossprod(modelGrantRate2$residuals))
MSE <- RSS / length(modelGrantRate2$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData4 <- GrantRateData3 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData4$mean[1]
A2<-GrantRateData4$mean[2]
B1<-GrantRateData4$mean[3]
B2<-GrantRateData4$mean[4]

#Then we can calculate Cohen's d
#large -0.41, interestingly.
d=((A1-B1) - (A2-B2))/(2*RMSE)

#### HYPOTHESIS 3: GRANT RATES - ALT ANALYSIS - WEEKS -------------- 

#Okay. What about doing grants by WEEK to even out all this weirdness?

#WE need the week of the current year, plus 52 weeks for every other year
#prior to the current one. So week 1 = first week of Jan, 2010
GrantRateData5 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  mutate(ResYearPlus=ResolutionYear-2010,
         ResolutionWeekYear=week(ResolutionDate),
         ResolutionWeek=ResYearPlus*52+ResolutionWeekYear) %>%
  select(Granted,TreatmentMDX,ResolutionWeek,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionWeek,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
GrantRatePT3 <- GrantRateData5 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Nope, sure is not parallel
modelGrantRatePT3<- lm(mean ~ ResolutionWeek*TreatmentMDX, 
                      data=GrantRatePT3)
summary(modelGrantRatePT3)
coeftest(modelGrantRatePT3, vcov. = vcovHC(modelGrantRatePT3, type='HC3'))

#If we run the main analysis anyway...
#We do see a DID effect
modelGrantRate3<- lm(mean ~ MayoRes*TreatmentMDX + ResolutionWeek, 
                    data=GrantRateData5)
summary(modelGrantRate3)

bptest(modelGrantRate3)
coeftest(modelGrantRate3, vcov. = vcovHC(modelGrantRate3, type='HC3'))

#Interaction plot

cat_plot(modelGrantRate3, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Grant Rate (Weekly)",
       x="Mayo Timing of Grant Week")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))


#Effect size for weekly?
RSS <- c(crossprod(modelGrantRate3$residuals))
MSE <- RSS / length(modelGrantRate3$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData6 <- GrantRateData5 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData6$mean[1]
A2<-GrantRateData6$mean[2]
B1<-GrantRateData6$mean[3]
B2<-GrantRateData6$mean[4]

#-0.33, evening it out between the other two.
d=((A1-B1) - (A2-B2))/(2*RMSE)

#What does weekly look like graphed?
GrantRateDataGraph2 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate & !is.na(TreatmentMDX)) %>%
  mutate(ResYearPlus=ResolutionYear-2010,
         ResolutionWeekYear=week(ResolutionDate),
         ResolutionWeek=ResYearPlus*52+ResolutionWeekYear) %>%
  select(Granted,TreatmentMDX,ResolutionWeek,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionWeek) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)")) %>%
  ungroup()

#Very similar... I'm really, really surprised it's non-parallel looking at this
#graph.
GrantRateDataGraph2 %>%
  ggplot(aes(x=ResolutionWeek, y=mean, group=TreatmentMDX,color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(breaks=c(1,104,208,312,416), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=118,color="black",linetype="dotted") +
  annotate(x=118,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents Granted") +
  xlab("Week of Resolution") +
  #labs(title="Patent Grant Rate (By Month) Over Time")+
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#Let's see only pre-Mayo?
#Yes, zooming in on the pre-Mayo period, what looked like a smooth parallel
#curve does betray its irregularity.
GrantRateDataGraph2 %>%
  ggplot(aes(x=ResolutionWeek, y=mean, group=TreatmentMDX,color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(limits=c(0,120),
                     breaks=c(10,20,30,40,50,60,70,80,90,100,110,120), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=118,color="black",linetype="dotted") +
  annotate(x=118,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents Granted") +
  xlab("Week of Resolution") +
  #labs(title="Patent Grant Rate (By Month) Over Time")+
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))
  

#### HYPOTHESIS 3A: GRANT RATES - ALT ANALYSIS -  SMALL US COMPANIES ONLY -------------- 

#Added 10-29-2023 to investigate an additional hypothesis.
#Only difference is using df3 - aka, the data only for small, US-based companies.

GrantRateData3a <- df3 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(Granted,TreatmentMDX,ResolutionDate,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
GrantRatePT3a <- GrantRateData3a %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelGrantRatePT<- lm(mean ~ ResolutionDate*TreatmentMDX, 
                      data=GrantRatePT3a)
summary(modelGrantRatePT)
coeftest(modelGrantRatePT, vcov. = vcovHC(modelGrantRatePT, type='HC3'))

#Main Analysis
#We do see a DID effect
modelGrantRate<- lm(mean ~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                    data=GrantRateData2)
summary(modelGrantRate)

bptest(modelGrantRate)
coeftest(modelGrantRate, vcov. = vcovHC(modelGrantRate, type='HC3'))

#This code extracts marginal means for a table.

modelGrantRateEM <- emmeans(modelGrantRate, ~MayoRes*TreatmentMDX,
                            vcov = sandwich::vcovHC(modelGrantRate,type='HC3'))

em<-as.data.frame(modelGrantRateEM)
em2<-em %>%
  select(MayoRes,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoRes,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,4)," (",round(SE_Before,4),")"),
         CombA=paste0(round(emmean_After,4)," (",round(SE_After,4),")"))

write_clip(unname(em2[1:2,6:7]))

#Effect size?
RSS <- c(crossprod(modelGrantRate$residuals))
MSE <- RSS / length(modelGrantRate$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData3aM <- GrantRateData3 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData3aM$mean[1]
A2<-GrantRateData3aM$mean[2]
B1<-GrantRateData3aM$mean[3]
B2<-GrantRateData3aM$mean[4]

#A small -0.08
d=((A1-B1) - (A2-B2))/(2*RMSE)

#### HYPOTHESIS 4: TOTAL AND UNIQUE WORDS TABULAR -------------

#I noticed on eyeballing the data that there are some truly outre
#patents. I have excluded them for now with the 3000-word cutoff.

Claim1Words<- df2 %>%
  filter(Granted==1 & !is.na(TreatmentMDX) 
         & ResolutionDate > StartDate &
           ResolutionDate < EndDate & PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords, GrantMonth,MayoGrant,TreatmentMDX,MayoGrantIm) %>%
  group_by(GrantMonth,TreatmentMDX,MayoGrant,MayoGrantIm) %>%
  summarize(MeanWordsT=mean(PubClaim1TotalWords, na.rm=TRUE),
            MeanWordsU=mean(PubClaim1UniqueWords,na.rm=TRUE)) %>%
  ungroup()

#Before/After Mayo
#This double pivot took me nearly 30 minutes to code.
Claim1WordsMayo <- Claim1Words %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoGrant)) %>%
  group_by(TreatmentMDX,MayoGrant) %>%
  summarize(MeanWordsT=mean(MeanWordsT, na.rm=TRUE),
            MeanWordsU=mean(MeanWordsU,na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoGrant, 
              values_from=c(MeanWordsT,MeanWordsU)) %>%
  pivot_longer(-c(TreatmentMDX),
               names_sep="_",
               names_to=c("variable",".value")) %>%
  arrange(variable) %>%
  ungroup()

write_clip(as.matrix(unname(Claim1WordsMayo[1:4,3:4])))

#### HYPOTHESIS 4: TOTAL/UNIQUE WORDS - ANALYSIS ------------

Claim1WordsData <- df2 %>%
  filter(Granted==1 & !is.na(TreatmentMDX) &
           GrantDate > StartDate &
           GrantDate < EndDate & PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords, GrantDate,MayoGrant,TreatmentMDX) %>%
  group_by(GrantDate,TreatmentMDX,MayoGrant) %>%
  summarize(MeanWordsT=mean(PubClaim1TotalWords, na.rm=TRUE),
            MeanWordsU=mean(PubClaim1UniqueWords,na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
Claim1WordsDataPT <- Claim1WordsData %>%
  filter(MayoGrant=="Before")

#Parallel trends analysis
#Parallel
modelClaim1WordsTPT<- lm(MeanWordsT ~ GrantDate*TreatmentMDX, 
                         data=Claim1WordsDataPT)
summary(modelClaim1WordsTPT)
coeftest(modelClaim1WordsTPT, vcov. = vcovHC(modelClaim1WordsTPT, type='HC3'))


#And parallel
modelClaim1WordsUPT<- lm(MeanWordsU ~ GrantDate*TreatmentMDX, 
                         data=Claim1WordsDataPT)
summary(modelClaim1WordsUPT)
coeftest(modelClaim1WordsUPT, vcov. = vcovHC(modelClaim1WordsUPT, type='HC3'))

#So now we do actual DID analysis.
#Yes, there's an effect
modelClaim1WordsT<- lm(MeanWordsT ~ MayoGrant*TreatmentMDX + as.Date(GrantDate), 
                       data=Claim1WordsData)
summary(modelClaim1WordsT)

bptest(modelClaim1WordsT)
coeftest(modelClaim1WordsT, vcov. = vcovHC(modelClaim1WordsT, type='HC3'))

modelClaim1WordsTEM <- emmeans(modelClaim1WordsT, ~MayoGrant*TreatmentMDX,
                               vcov = sandwich::vcovHC(modelClaim1WordsT,type='HC3'))

em<-as.data.frame(modelClaim1WordsTEM)
em2<-em %>%
  select(MayoGrant,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoGrant,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

#Interaction plot

cat_plot(modelClaim1WordsT, pred = MayoGrant, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Total Words",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))


#First we obtain the RMSE
RSS <- c(crossprod(modelClaim1WordsT$residuals))
MSE <- RSS / length(modelClaim1WordsT$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
ClaimWords <- Claim1WordsData %>%
  filter(!is.na(MayoGrant) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoGrant) %>%
  summarize(mean=mean(MeanWordsT, na.rm=TRUE))

A1<-ClaimWords$mean[1]
A2<-ClaimWords$mean[2]
B1<-ClaimWords$mean[3]
B2<-ClaimWords$mean[4]

#Then we can calculate Cohen's d
#big effect, d = 0.47
d=((A1-B1) - (A2-B2))/(2*RMSE)

#What about for unique words??
modelClaim1WordsU<- lm(MeanWordsU ~ MayoGrant*TreatmentMDX, 
                       data=Claim1WordsData)
summary(modelClaim1WordsU)
coeftest(modelClaim1WordsU, vcov. = vcovHC(modelClaim1WordsU, type='HC3'))

modelClaim1WordsUEM <- emmeans(modelClaim1WordsU, ~MayoGrant*TreatmentMDX,
                               vcov = sandwich::vcovHC(modelClaim1WordsU,type='HC3'))

em<-as.data.frame(modelClaim1WordsUEM)
em2<-em %>%
  select(MayoGrant,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoGrant,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

cat_plot(modelClaim1WordsU, pred = MayoGrant, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Unique Words",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#First we obtain the RMSE
RSS <- c(crossprod(modelClaim1WordsU$residuals))
MSE <- RSS / length(modelClaim1WordsU$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
ClaimWords2 <- Claim1WordsData %>%
  filter(!is.na(MayoGrant) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoGrant) %>%
  summarize(mean=mean(MeanWordsU, na.rm=TRUE))

A1<-ClaimWords2$mean[1]
A2<-ClaimWords2$mean[2]
B1<-ClaimWords2$mean[3]
B2<-ClaimWords2$mean[4]

#Then we can calculate Cohen's d
#huge 0.55

d=((A1-B1) - (A2-B2))/(2*RMSE)

#### HYPOTHESIS 5: TIME TO RESOLUTION - TABULAR --------------------

TimetoRes <- df2 %>%
  filter(StillPending==0 & !is.na(TreatmentMDX) & !is.na(MayoRes) &
           ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(PendencytoGrant,PendencytoAbandon,TreatmentMDX,ResolutionMonth,MayoRes,MayoResIm) %>%
  group_by(ResolutionMonth,TreatmentMDX,MayoRes,MayoResIm) %>%
  summarize(GrantP=mean(PendencytoGrant, na.rm=TRUE),
            AbandonP=mean(PendencytoAbandon,na.rm=TRUE)) %>%
  ungroup()

#How many Granted do we have? Or rather, resolved?

table(df2$StillPending)
table(df2$Granted)

TimetoResMayo <- TimetoRes %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(GrantP=mean(GrantP, na.rm=TRUE),
            AbandonP=mean(AbandonP,na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoRes, 
              values_from=c(GrantP,AbandonP)) %>%
  pivot_longer(-c(TreatmentMDX),
               names_sep="_",
               names_to=c("variable",".value")) %>%
  arrange(desc(variable)) %>%
  ungroup()


write_clip(as.matrix(unname(TimetoResMayo[1:4,3:4])))


#### HYPOTHESIS 5: TIME TO RESOLUTION - ANALYSIS ---------------------

TimetoResData <- df2 %>%
  filter(StillPending==0 & !is.na(TreatmentMDX) &
           ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(PendencytoGrant,PendencytoAbandon,ResolutionDate,MayoRes,TreatmentMDX) %>%
  group_by(ResolutionDate,TreatmentMDX,MayoRes) %>%
  summarize(GrantP=mean(PendencytoGrant, na.rm=TRUE),
            AbandonP=mean(PendencytoAbandon,na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
TimetoResPT <- TimetoResData %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Parallel for grants, not for abandoned
modelTimetoResGPT<- lm(GrantP ~ ResolutionDate*TreatmentMDX, 
                       data=TimetoResPT)
summary(modelTimetoResGPT)
coeftest(modelTimetoResGPT, vcov. = vcovHC(modelTimetoResGPT, type='HC3'))

modelTimetoResAPT<- lm(AbandonP ~ ResolutionDate*TreatmentMDX, 
                       data=TimetoResPT)
summary(modelTimetoResAPT)
coeftest(modelTimetoResAPT, vcov. = vcovHC(modelTimetoResAPT, type='HC3'))

#I was troubled by the vastly different sample sizes for granted and abandoned
#patents, so I conducted an investigation.

dfTest <- df2 %>%
  select(PendencytoGrant,PendencytoAbandon,ResolutionDate,MayoRes,TreatmentMDX,Abandoned,Granted) %>%
  filter(MayoRes=="Before")

#We do see equal numbers, ish, of abandoned and granted patents in the pre-Mayo period.
table(dfTest$Abandoned)
table(dfTest$Granted)

#So what causes this issue? Could it be that grants come in 'clumps' by date?
#And - yes! That is in fact what we see.
#SO we don't need to be concerned.
GrantAbandonTest <- dfTest %>%
  group_by(ResolutionDate) %>%
  summarize(GrantNum=sum(Granted),AbandonNum=sum(Abandoned)) %>%
  mutate(AnyGrant = if_else(GrantNum > 0,1,0),
         AnyAbandon = if_else(AbandonNum > 0,1,0)) %>%
  summarize(AnyGrantSum=sum(AnyGrant),AnyAbandonSum=sum(AnyAbandon)) %>%
  ungroup()


#Actual analysis for grants
#DID effect!
modelTimetoResDID<- lm(GrantP~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                       data=TimetoResData)
summary(modelTimetoResDID)
coeftest(modelTimetoResDID, vcov. = vcovHC(modelTimetoResDID, type='HC3'))

modelTimetoResDIDEM <- emmeans(modelTimetoResDID, ~MayoRes*TreatmentMDX,
                               vcov = sandwich::vcovHC(modelTimetoResDID,type='HC3'))

em<-as.data.frame(modelTimetoResDIDEM)
em2<-em %>%
  select(MayoRes,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoRes,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

#Interaction plot

cat_plot(modelTimetoResDID, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Pendency to Grant",
       x="Mayo Timing of Resolution Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))

#Now we need to get an effect size measure
#We will use Jake Westfall's formula as lsited at:
# http://jakewestfall.org/blog/index.php/2015/05/27/follow-up-what-about-uris-2n-rule/#comment-24

#First we obtain the RMSE
RSS <- c(crossprod(modelTimetoResDID$residuals))
MSE <- RSS / length(modelTimetoResDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
TimetoResData2 <- TimetoResData %>%
  filter(!is.na(MayoRes)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(GrantP=mean(GrantP, na.rm=TRUE))

A1<-TimetoResData2$GrantP[1]
A2<-TimetoResData2$GrantP[2]
B1<-TimetoResData2$GrantP[3]
B2<-TimetoResData2$GrantP[4]

#Then we can calculate Cohen's d
#Effect size of 0.47
d=((A1-B1) - (A2-B2))/(2*RMSE)

#### HYPOTHESIS 6: APP LENGTH - TABULAR -----------------

#We need a couple tiny tweaks to variables...

df2<-df2 %>%
  mutate(PubWordCount=as.numeric(PubWordCount),
         Published=if_else(is.na(PublicationDate), "N","Y"))
         
#On eyeballing the data, we have a set of points that have unrealistically huge
#total word scores happening around April 2010 - they're all listed as having
#13000 total words and 4000 unique words! I will omit those from the dataset.

PubWord <- df2 %>%
  filter(Published=="Y" & !is.na(TreatmentMDX) &
           FileDate > StartDate & FileDate < EndDate &
           PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,PubWordCount,
         FileMonth,MayoFile,MayoFileIm,TreatmentMDX,Continuation) %>%
  group_by(FileMonth,TreatmentMDX,Continuation,MayoFile,MayoFileIm) %>%
  summarize(Total=mean(PubClaim1TotalWords, na.rm=TRUE),
            Unique=mean(PubClaim1UniqueWords, na.rm=TRUE),
            WordC=mean(PubWordCount, na.rm=TRUE)) %>%
  pivot_wider(names_from=Continuation, values_from=c(Total,Unique,WordC)) %>%
  rowwise() %>%
  mutate(Total_Total=mean(c(Total_Continuation,`Total_New Patent`),na.rm=TRUE),
         Unique_Total=mean(c(Unique_Continuation,`Unique_New Patent`),na.rm=TRUE),
         WordC_Total=mean(c(WordC_Continuation,`WordC_New Patent`),na.rm=TRUE)) %>%
  ungroup()


#Before/After Mayo
PubWordMayo <- PubWord %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile)) %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(Total_Total=mean(Total_Total, na.rm=TRUE),
            Unique_Total=mean(Unique_Total, na.rm=TRUE),
            WordC_Total=mean(WordC_Total, na.rm=TRUE),
            Total_New=mean(`Total_New Patent`, na.rm=TRUE),
            Unique_New=mean(`Unique_New Patent`, na.rm=TRUE),
            WordC_New=mean(`WordC_New Patent`, na.rm=TRUE),
            Total_Cont=mean(Total_Continuation, na.rm=TRUE),
            Unique_Cont=mean(Unique_Continuation, na.rm=TRUE),
            WordC_Cont=mean(WordC_Continuation, na.rm=TRUE)) %>%
  pivot_wider(names_from=MayoFile, 
              values_from=c(Total_Total,Unique_Total,WordC_Total,
                            Total_New,Unique_New,WordC_New,
                            Total_Cont,Unique_Cont,WordC_Cont)) %>%
  pivot_longer(-c(TreatmentMDX),
               names_sep="_",
               names_to=c("variable","subset",".value")) %>%
  arrange(variable,desc(subset)) %>%
  ungroup()

write_clip(as.matrix(unname(PubWordMayo[1:18,4:5])))

#### HYPOTHESIS 6: APP LENGTH - ANALYSIS -------------

PubWordData <- df2 %>%
  filter(Published=="Y" & !is.na(TreatmentMDX) &
           FileDate > StartDate & FileDate < EndDate &
           PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,PubWordCount,
         FileDate,MayoFile,MayoFileIm,TreatmentMDX,Continuation) %>%
  group_by(FileDate,TreatmentMDX,Continuation,MayoFile) %>%
  summarize(Total=mean(PubClaim1TotalWords, na.rm=TRUE),
            Unique=mean(PubClaim1UniqueWords, na.rm=TRUE),
            WordC=mean(PubWordCount, na.rm=TRUE)) %>%
  pivot_wider(names_from=Continuation, values_from=c(Total,Unique,WordC)) %>%
  rowwise() %>%
  mutate(Total_Total=mean(c(Total_Continuation,`Total_New Patent`),na.rm=TRUE),
         Unique_Total=mean(c(Unique_Continuation,`Unique_New Patent`),na.rm=TRUE),
         WordC_Total=mean(c(WordC_Continuation,`WordC_New Patent`),na.rm=TRUE)) %>%
  pivot_longer(-c(FileDate,TreatmentMDX,MayoFile),
               names_to=c("Variable","Status"),
               names_sep="_",
               values_to="number")  %>%
  ungroup()


#Parallel trends analysis
PubWordTPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="Total")

PubWordUPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="Unique")

PubWordWCPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="WordC")

#Actual analyses - all are parallel?

#Total Words
modelPubWordTPT<- lm(number ~ FileDate*TreatmentMDX*Status, 
                     data=PubWordTPT)
summary(modelPubWordTPT)
coeftest(modelPubWordTPT, vcov. = vcovHC(modelPubWordTPT, type='HC3'))

#Unique Words
modelPubWordUPT<- lm(number ~ FileDate*TreatmentMDX*Status, 
                     data=PubWordUPT)
summary(modelPubWordUPT)
coeftest(modelPubWordUPT, vcov. = vcovHC(modelPubWordUPT, type='HC3'))


#DiD Analysis

#NEw datasets
PubWordT <- PubWordData %>%
  filter(Variable=="Total")

PubWordU <- PubWordData %>%
  filter(Variable=="Unique")

PubWordWC <- PubWordData %>%
  filter(Variable=="WordC")

#DID for Total - yes!
modelPubWordTDID<- lm(number ~ MayoFile*TreatmentMDX*Status + as.Date(FileDate), 
                      data=PubWordT)
summary(modelPubWordTDID)

coeftest(modelPubWordTDID, vcov. = vcovHC(modelPubWordTDID, type='HC3'))

modelPubWordTDIDEM <- emmeans(modelPubWordTDID, ~MayoFile*TreatmentMDX,
                              vcov = sandwich::vcovHC(modelPubWordTDID,type='HC3'))

em<-as.data.frame(modelPubWordTDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoFile,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment"))%>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))

#Simplified DID Graph
cat_plot(modelPubWordTDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
         interval = FALSE,colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Adjusted Total Words",
       x="Mayo Timing of Filing Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
theme(text=element_text(size=14))

#Now we need to get an effect size measure

#First we obtain the RMSE
RSS <- c(crossprod(modelPubWordTDID$residuals))
MSE <- RSS / length(modelPubWordTDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
PubWordMeans <- PubWordData %>%
  filter(!is.na(MayoFile) & Variable=="Total") %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(Total=mean(number, na.rm=TRUE))

A1<-PubWordMeans$Total[1]
A2<-PubWordMeans$Total[2]
B1<-PubWordMeans$Total[3]
B2<-PubWordMeans$Total[4]

#Effect size of 0.15, pretty small
d=((A1-B1) - (A2-B2))/(2*RMSE)

#DID for Unique - yes!
modelPubWordUDID<- lm(number ~ MayoFile*TreatmentMDX*Status + as.Date(FileDate), 
                      data=PubWordU)
summary(modelPubWordUDID)

coeftest(modelPubWordUDID, vcov. = vcovHC(modelPubWordUDID, type='HC3'))

modelPubWordUDIDEM <- emmeans(modelPubWordUDID, ~MayoFile*TreatmentMDX,
                              vcov = sandwich::vcovHC(modelPubWordUDID,type='HC3'))

em<-as.data.frame(modelPubWordUDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoFile,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment"))%>%
  mutate(CombB=paste0(round(emmean_Before,2)," (",round(SE_Before,2),")"),
         CombA=paste0(round(emmean_After,2)," (",round(SE_After,2),")"))

write_clip(unname(em2[1:2,6:7]))
#Simplified DID Graph
cat_plot(modelPubWordUDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
         interval = FALSE,colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Adjusted Unique Words",
       x="Mayo Timing of Filing Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))
theme(text=element_text(size=14)) 

#Now we need to get an effect size measure
#We'll stick to 2-way for now

#First we obtain the RMSE
RSS <- c(crossprod(modelPubWordUDID$residuals))
MSE <- RSS / length(modelPubWordUDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
PubWordMeansU <- PubWordData %>%
  filter(!is.na(MayoFile) & Variable=="Unique") %>%
  group_by(TreatmentMDX,MayoFile) %>%
  summarize(Total=mean(number, na.rm=TRUE))

A1<-PubWordMeansU$Total[1]
A2<-PubWordMeansU$Total[2]
B1<-PubWordMeansU$Total[3]
B2<-PubWordMeansU$Total[4]

#SMall d of 0.16
d=((A1-B1) - (A2-B2))/(2*RMSE)

#### DESCRIPTIVE GRAPHING ----------

#### US VS INTERNATIONAL PATENTS -------------

#For this we have CN, EP, and JP data for international comparators.
#I will process each .csv as I did the prior ones - removing spaces
#and underscores from variable names, etc.

ep <- read_csv("epmeddx.csv")
jp <- read_csv("jpmeddx.csv")
cn <- read_csv("cnmeddx.csv")

ep$id <- "Europe"
jp$id <- "Japan"
cn$id <- "China"

us <- df2 %>%
  filter(TreatmentMDX==1) %>%
  select(NormalizedApplicationNumber,FileDate) %>%
  mutate(id="United States",
         AppSerialNumber=as.character(NormalizedApplicationNumber))
         
intMedDX <- bind_rows(ep,jp,cn) %>%
  select(AppSerialNumber, FileDate, id) %>%
  mutate(FileDate=mdy(FileDate)) 

allMedDx <- bind_rows(intMedDX,us) %>%
  mutate(QuarterFileDate = quarter(FileDate, type = "year.quarter")) %>%
  filter(QuarterFileDate >2008.4) %>%
  group_by(id,QuarterFileDate) %>%
  summarize(AppNum=n()) 

#How about by year?

colnames(df2)

us2 <- df2 %>%
  filter(TreatmentMDX==1) %>%
  select(NormalizedApplicationNumber,FileDate) %>%
  mutate(id="United States",
         AppSerialNumber=as.character(NormalizedApplicationNumber),
         FileYear=year(FileDate))

intMedDX2 <- bind_rows(ep,jp,cn) %>%
  select(AppSerialNumber, FileDate, id) %>%
  mutate(FileDate=mdy(FileDate)) %>%
  mutate(FileYear=year(FileDate))

allMedDx2 <- bind_rows(intMedDX2,us2) %>%
  filter(FileYear > 2008) %>%
  group_by(id,FileYear) %>%
  summarize(AppNum=n()) 

write_clip(allMedDx2)

#Now we can graph.

allMedDx %>%
  ggplot(aes(x=QuarterFileDate, y=AppNum, color=id)) +
  geom_smooth(aes(color=id),size=1, alpha=0.2,span=0.5) +
  scale_x_yearqtr(format = '%Y.%q',n=12, expand=c(0,0)) +
  geom_vline(xintercept=2012.1,color="black",linetype="dotted") +
  annotate(x=2012.1,y=+Inf,label="Mayo - 3/2012",hjust=1.1, vjust=1.5, size=3, geom="text") +
  geom_vline(xintercept=2014.1,color="black",linetype="dotted") +
  annotate(x=2014.1,y=+Inf,label="24 Months Post-Mayo",vjust=1.5, size=3, geom="text") +
  geom_vline(xintercept=2016.1,color="black",linetype="dotted") +
  annotate(x=2016.1,y=+Inf,label="48 Months Post-Mayo",hjust=-0.05,vjust=1.5,size=3, geom="text") +
  ylab("Number of Applications Filed For") +
  xlab("File Date, by Quarter") +
  labs(title="MedDX Patent Applications By Origin - 2009-2021") +
  scale_color_manual(name="Origin:",
                     values=c("purple","orange","blue","black"
                     )) +
  theme_classic() +
  theme(legend.position="bottom") +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(text=element_text(size=14))

#How about by year?

allMedDx2 %>%
  filter(FileYear<2020 & FileYear > 2009) %>%
  ggplot(aes(x=FileYear, y=AppNum, linetype=id)) +
  geom_smooth(aes(linetype=id),color="black",size=1, alpha=0.2,span=0.8,se=FALSE) +
  geom_vline(xintercept=2012.1,color="black",linetype="dotted") +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0))+
  scale_linetype_manual(name="Origin",values=c("solid","dotted","dashed","twodash"))+
  annotate(x=2012.1,y=+Inf,label="Mayo - 3/2012",hjust=1.1, vjust=1.5, size=4, geom="text") +
  ylab("Number of Applications Filed For") +
  xlab("File Year") +
  labs(title="Medical Diagnostic Patent Applications By Country") +
  theme_classic() +
  theme(legend.position="bottom") +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text = element_text(size=14))

#Year in color

allMedDx2 %>%
  filter(FileYear<2020 & FileYear > 2009) %>%
  ggplot(aes(x=FileYear, y=AppNum, color=id)) +
  geom_smooth(aes(color=id),size=1, alpha=0.2,span=0.8,se=FALSE) +
  geom_vline(xintercept=2012.1,color="black",linetype="dotted") +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0))+
  annotate(x=2012.1,y=+Inf,label="Mayo - 3/2012",hjust=1.1, vjust=1.5, size=4, geom="text") +
  ylab("Number of Applications Filed For") +
  xlab("File Year") +
  scale_color_manual(name="Origin:",
                     values=c("purple","orange","blue","black"
                     )) +
  theme_classic() +
  theme(legend.position="bottom") +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text = element_text(size=14))


#Let's do a pre-Mayo graph

allMedDx %>%
  filter(QuarterFileDate < 2012.2 & QuarterFileDate > 2009.4) %>%
  ggplot(aes(x=QuarterFileDate, y=AppNum, color=id)) +
  geom_smooth(aes(color=id),size=1, alpha=0.2,span=0.6,se=F) +
  scale_x_yearqtr(format = '%Y.%q', expand=c(0,0)) +
  scale_y_continuous(limits=c(0,1000)) +
  ylab("Number of Applications Filed For") +
  xlab("File Date, by Quarter") +
  labs(title="MedDX Patent Applications By Origin - 2010-2012 Pre-Mayo") +
  scale_color_manual(name="Origin:",
                     values=c("purple","orange","blue","black"
                     )) +
  theme_classic() +
  theme(legend.position="bottom") +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(text=element_text(size=14))

### H1: ACTIVITY GRAPHS ------------

#Then we begin to draw graphs for the direct hypotheses of the paper.
#We will need to use the H1Data because it combines the two datasets.

AppsGraph <- H1Data %>%
  select(NormalizedApplicationNumber,TreatmentMDX,FileYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(FileYear) &
           FileYear > 2009 & FileYear < 2020) %>%
  group_by(TreatmentMDX,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"))

GrantsGraph <- H1Data %>%
  select(Granted,TreatmentMDX,GrantYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(GrantYear) & 
           GrantYear > 2009 & GrantYear < 2020) %>%
  group_by(TreatmentMDX,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"))


AppsGraph %>%
  ggplot(aes(x=FileYear, y=AppNum)) +
  facet_wrap(~TreatmentMDX, scales="free_y") +
  geom_smooth(size=1, alpha=0.2,span=0.8, color="black",se=FALSE) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("File Year") +
  #labs(title="Applications Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

GrantsGraph %>%
  ggplot(aes(x=GrantYear, y=GrantSum)) +
  facet_wrap(~TreatmentMDX, scales="free_y") +
  geom_smooth(size=1, alpha=0.2,span=0.8, color="black",se=FALSE) +
  scale_color_manual(values=c("black")) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("Grant Year") +
  #labs(title="Grants Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))


#### H1a: ACTIVITY GRAPHS BY ENTITY ------------

AppsGraph2 <- H1Data2 %>%
  select(NormalizedApplicationNumber,TreatmentMDX,FileYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(FileYear) &
           FileYear > 2009 & FileYear < 2020) %>%
  group_by(TreatmentMDX,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"))

GrantsGraph2 <- H1Data2 %>%
  select(Granted,TreatmentMDX,GrantYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(GrantYear) & 
           GrantYear > 2009 & GrantYear < 2020) %>%
  group_by(TreatmentMDX,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"))


AppsGraph2 %>%
  ggplot(aes(x=FileYear, y=AppNum)) +
  facet_wrap(~TreatmentMDX, scales="free_y") +
  geom_smooth(size=1, alpha=0.2,span=0.8, color="black",se=FALSE) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("File Year") +
  #labs(title="Applications Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

GrantsGraph2 %>%
  ggplot(aes(x=GrantYear, y=GrantSum)) +
  facet_wrap(~TreatmentMDX, scales="free_y") +
  geom_smooth(size=1, alpha=0.2,span=0.8, color="black",se=FALSE) +
  scale_color_manual(values=c("black")) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("Grant Year") +
  #labs(title="Grants Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

####H1A2: WITH ONLY TC1600 DATA -----------------

TC1600only2B <- dfTC1600 %>%
  filter(GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639" 
         & Domestic=="Y" & PEDSApplicationBusinessEntityStatusCategory=="SMALL") %>%
  select(ApplicationNumber,PublicationNumber,FilingDate,PublicationDate,GrantDate,MedDxTag) %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0)) %>%
  rename(AppSerialNumber=ApplicationNumber,FileDate=FilingDate) 


H1Data2B <- TC1600only2B %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

PubMayo <- mdy("6-20-2013")
PubMayo2Years <- mdy("6-20-2015")

H1Data2B <- H1Data2B %>%
  mutate(MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_))  

H1Data2B <- H1Data2B %>%
  mutate(MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

H1Data2B <- H1Data2B %>%
  mutate(across(c(MayoPub,MayoFile,MayoGrant,), 
                ~factor(.x, levels=c("Before","After"))))

H1Data2B <- H1Data2B %>%
  mutate(GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate),
         GrantYear=year(GrantDate),
         FileYear=year(FileDate),
         PubYear=year(PublicationDate)
  )

#For the new H1:
AppsGraph <- H1Data2B %>%
  select(AppSerialNumber,MedDxTag,FileYear) %>%
  filter(!is.na(MedDxTag) & !is.na(FileYear) &
           FileYear > 2009 & FileYear < 2020) %>%
  group_by(MedDxTag,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsGraph <- H1Data2B %>%
  select(PublicationNumber,MedDxTag,PubYear) %>%
  filter(!is.na(MedDxTag) & !is.na(PubYear) & 
           PubYear > 2009 & PubYear < 2020) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MedDxTag,PubYear) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsGraph <- H1Data2B %>%
  select(Granted,MedDxTag,GrantYear) %>%
  filter(!is.na(MedDxTag) & !is.na(GrantYear) & 
           GrantYear > 2009 & GrantYear < 2020) %>%
  group_by(MedDxTag,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

AllGraph <- cbind(AppsGraph,PubsGraph,GrantsGraph)
AllGraph <- AllGraph[,c(1:3,6,9)]

AllGraph <- AllGraph %>%
  rename(Applications=AppNum,
         Grants=GrantSum,
         Publications=PubSum) %>%
  pivot_longer(-c(MedDxTag,FileYear)) %>%
  rename(Outcome=name) %>%
  mutate(MedDxTag = ifelse(MedDxTag==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"))


AllGraph %>%
  filter(Outcome=="Applications") %>%
  ggplot(aes(x=FileYear, y=value,group=Outcome,color=Outcome)) +
  facet_wrap(~MedDxTag, scales="free_y") +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black")) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("File Year") +
  #labs(title="Applications Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) 
theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

AllGraph %>%
  filter(Outcome=="Grants") %>%
  ggplot(aes(x=FileYear, y=value,group=Outcome,color=Outcome)) +
  facet_wrap(~MedDxTag, scales="free_y") +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black")) +
  scale_x_continuous(breaks=c(2009,2011,2013,2015,2017,2019), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number") +
  xlab("Grant Year") +
  #labs(title="Grants Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

####H2: PERCENTAGE OF NEW APPS GRAPHING -------

AppStatusDataG <- df2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileYear) %>%
  group_by(Continuation,TreatmentMDX,FileYear) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)")) %>%
  ungroup()

AppStatusDataG %>%
  ggplot(aes(x=FileYear, y=PercNew, group=TreatmentMDX, color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=2012.2,color="black",linetype="dotted") +
  annotate(x=2012.2,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents That Are New Applications") +
  xlab("File Year") +
  #labs(title="Percentage of Patent Applications That Are New Over Time")+
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### H2a: PERCENTAGE OF NEW APPS GRAPHING, BY ENTITY -------

#Only change is to use df3
AppStatusDataG <- df3 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileYear) %>%
  group_by(Continuation,TreatmentMDX,FileYear) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)")) %>%
  ungroup()

AppStatusDataG %>%
  ggplot(aes(x=FileYear, y=PercNew, group=TreatmentMDX, color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=2012.2,color="black",linetype="dotted") +
  annotate(x=2012.2,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents That Are New Applications") +
  xlab("File Year") +
  #labs(title="Percentage of Patent Applications That Are New Over Time")+
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### H3: GRANT RATES GRAPHING ---------

GrantRateDataGraph <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate & !is.na(TreatmentMDX)) %>%
  select(Granted,TreatmentMDX,ResolutionYear) %>%
  group_by(TreatmentMDX,ResolutionYear) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)")) %>%
  ungroup()

GrantRateDataGraph %>%
  ggplot(aes(x=ResolutionYear, y=mean, group=TreatmentMDX,color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents Granted") +
  xlab("Year of Resolution") +
  #labs(title="Patent Grant Rate (By Month) Over Time")+
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### H4: TOTAL UNIQUE WORDS, GRAPHING -------------

Claim1WordsDataGraph <- df2 %>%
  filter(Granted==1 & !is.na(TreatmentMDX) &
           GrantDate > StartDate &
           GrantDate < EndDate & PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,GrantYear,TreatmentMDX) %>%
  group_by(GrantYear,TreatmentMDX) %>%
  summarize('Total Words'=mean(PubClaim1TotalWords, na.rm=TRUE),
            'Unique Words'=mean(PubClaim1UniqueWords,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)",
                               "Treatment (Molecular Diagnostic)")) %>%
  pivot_longer(-c(TreatmentMDX,GrantYear)) %>%
  mutate(GrantYear=as.numeric(GrantYear)) %>%
  ungroup()

Claim1WordsDataGraph %>%
  ggplot(aes(x=GrantYear, y=value, group=TreatmentMDX,color=TreatmentMDX)) +
  facet_wrap(~name, scales="free_y") +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_linetype_manual(values=c("solid","dashed")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Word Count") +
  xlab("Year of Granting") +
  #labs(title="Total and Unique Claim 1 Words Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### H5: TIME TO GRANT, GRAPHING ---------------

TimetoResDataG <- df2 %>%
  filter(StillPending==0 & !is.na(TreatmentMDX) &
           ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(PendencytoGrant,PendencytoAbandon,ResolutionYear,TreatmentMDX) %>%
  group_by(ResolutionYear,TreatmentMDX) %>%
  summarize('Grant Pendency'=mean(PendencytoGrant, na.rm=TRUE),
            'Abandonment Pendency'=mean(PendencytoAbandon,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)",
                               "Treatment (Molecular Diagnostic)")) %>%
  pivot_longer(-c(TreatmentMDX,ResolutionYear)) %>%
  ungroup() 

TimetoResDataG %>%
  ggplot(aes(x=ResolutionYear,y=value,color=TreatmentMDX)) +
  facet_wrap(~name) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_linetype_manual(values=c("solid","dashed")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, 
           vjust=1.5, size=4, geom="text") +
  ylab("Number of Days") +
  xlab("Year of Resolution") +
  # labs(title="Pendency to Grant Over Time") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### H7, WORDS IN PUBLISHED CLAIMS by cont status GRAPHING -------------

PubWordDataG <- df2 %>%
  filter(Published=="Y" & !is.na(TreatmentMDX) &
           FileDate > StartDate & FileDate < EndDate &
           PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,PubWordCount,
         FileYear,TreatmentMDX,Continuation) %>%
  group_by(FileYear,TreatmentMDX,Continuation) %>%
  summarize(Total=mean(PubClaim1TotalWords, na.rm=TRUE),
            Unique=mean(PubClaim1UniqueWords, na.rm=TRUE),
            WordC=mean(PubWordCount, na.rm=TRUE)) %>%
  pivot_wider(names_from=Continuation, values_from=c(Total,Unique,WordC)) %>%
  rowwise() %>%
  mutate(Total_Total=mean(c(Total_Continuation,`Total_New Patent`),na.rm=TRUE),
         Unique_Total=mean(c(Unique_Continuation,`Unique_New Patent`),na.rm=TRUE),
         WordC_Total=mean(c(WordC_Continuation,`WordC_New Patent`),na.rm=TRUE)) %>%
  pivot_longer(-c(FileYear,TreatmentMDX),
               names_to=c("Variable","Status"),
               names_sep="_",
               values_to="number")  %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)",
                               "Treatment (Molecular Diagnostic)"))%>%
  rename(Condition=TreatmentMDX) %>%
  mutate(Variable = case_when(
    Variable=="Total" ~ "Total Claim 1 Words",
    Variable=="Unique" ~ "Unique Claim 1 Words",
    Variable=="WordC" ~ "Word Count")) %>%
  ungroup()

PubWordDataG %>%
  ggplot(aes(x=FileYear,y=number,linetype=Condition,color=Status)) +
  facet_wrap(~Variable, scales="free_y") +
  scale_color_manual(values=c("grey20","grey50","grey80")) +
  geom_smooth(size=1, alpha=0.2,span=0.8,se=FALSE) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=3, geom="text") +
  ylab("Number of Words") +
  xlab("File Year") +
  #labs(title="Published Word Counts Over Time - Split by Status") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#Only Total
PubWordDataG %>%
  filter(Status=="Total") %>%
  ggplot(aes(x=FileYear,y=number, linetype=Condition)) +
  facet_wrap(~Variable, scales="free_y") +
  scale_linetype_manual(values=c("solid","dashed")) +
  geom_smooth(size=1, alpha=0.2,span=0.8,se=FALSE,color="black") +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  geom_vline(xintercept=2012.3,color="black",linetype="dotted") +
  annotate(x=2012.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=3, geom="text") +
  ylab("Number of Words") +
  xlab("File Year") +
  #labs(title="Published Word Counts Over Time - Overall") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#### EVENT STUDY GRAPHS ------------------------

#These are similar to the above graphs but are reformatted/repatterned to
#resemble traditional event study graphs.

#Mainly, we need to refactor the X axis to years since Mayo...
#And we need to figure out a Y axis that allows the two lines to overlap.

#To do that, we're going to convert both to percentages of the initial index
#value.

##### H1 - Apps --------------------

AppsGraph3 <- H1Data %>%
  select(NormalizedApplicationNumber,TreatmentMDX,FileYear,FileMonth) %>%
  filter(!is.na(TreatmentMDX) & !is.na(FileYear) &
           FileYear > 2009 & FileYear < 2020) %>%
  group_by(TreatmentMDX,FileMonth,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup() %>%
  group_by(TreatmentMDX) %>%
  mutate(RefValue=AppNum/first(AppNum)) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FileYear-2012,
         GraphMonth=as.numeric(FileMonth)-as.numeric(as.yearmon("2012-03")) )

as.numeric(as.yearmon("2012-03"))

H1event <- AppsGraph3 %>%
  ggplot(aes(x=GraphMonth, y=RefValue, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(.75,1.00, 1.25,1.50,1.75,2.00,2.25,2.50)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5, geom="text", family="serif") +
  ylab("Percentage of Base Value") +
  xlab("File Year (By Mayo)") +
  labs(title="*Figure 3a.* Applications by File Year and Patent Type") +
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,30,0), hjust=-0.15)) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm")) 

ggsave("H1event2.tiff",H1event,dpi=600,height=7,width=8)

##### H1 - GRANTS -----------------

AppsGraphG <- H1Data %>%
  select(Granted,TreatmentMDX,GrantYear,GrantMonth,FileYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(GrantYear) &
           FileYear > 2009 & FileYear < 2020 & GrantYear < 2020) %>%
  group_by(TreatmentMDX,GrantMonth,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() %>%
  group_by(TreatmentMDX) %>%
  mutate(RefValue=GrantSum/GrantSum[GrantMonth=="Jun 2011"]) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=GrantYear-2012,
         GraphMonth=as.numeric(GrantMonth)-as.numeric(as.yearmon("2012-03")) )

H1eventG <- AppsGraphG %>%
  ggplot(aes(x=GraphMonth, y=RefValue, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(-2.5,0,2.5,5.0,7.5,10.0,12.5,15.0,17.5,20,22.5)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5, family="serif",geom="text") +
  ylab("Percentage of Base Value") +
  xlab("Grant Year (By Mayo)") +
  labs(title="*Figure 3b.* Grants by Grant Year and Patent Type") +
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,30,0),hjust=-0.15)) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H1eventG2.tiff",H1eventG,dpi=600,height=7,width=8)

##### H1A - APPS FROM SMALL US FIRMS -----------------

AppsGraph3A <- H1Data2 %>%
  select(NormalizedApplicationNumber,TreatmentMDX,FileYear,FileMonth) %>%
  filter(!is.na(TreatmentMDX) & !is.na(FileYear) &
           FileYear > 2009 & FileYear < 2020) %>%
  group_by(TreatmentMDX,FileMonth,FileYear) %>%
  summarize(AppNum=n()) %>%
  ungroup() %>%
  group_by(TreatmentMDX) %>%
  mutate(RefValue=AppNum/first(AppNum)) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FileYear-2012,
         GraphMonth=as.numeric(FileMonth)-as.numeric(as.yearmon("2012-03")) )

H1AeventA <- AppsGraph3A %>%
  ggplot(aes(x=GraphMonth, y=RefValue, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(0.25,0.50,0.75,1.00, 1.25,1.50,1.75,2.00,2.25,2.5,2.75)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5, family="serif",geom="text") +
  ylab("Percentage of Base Value") +
  xlab("File Year (By Mayo)") +
  labs(title="*Figure 3c.* Applications from Small U.S.-Based Firms<br>by File Year and Patent Type") +
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,30,0),hjust=-0.15)) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H1AeventA2.tiff",H1AeventA,dpi=600,height=7,width=8)

##### H1B - GRANTS FROM SMALL US FIRMS -----------------


AppsGraphGB <- H1Data2 %>%
  select(Granted,TreatmentMDX,GrantYear,GrantMonth,FileYear) %>%
  filter(!is.na(TreatmentMDX) & !is.na(GrantYear) &
           FileYear > 2009 & FileYear < 2020 & GrantYear < 2020) %>%
  group_by(TreatmentMDX,GrantMonth,GrantYear) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() %>%
  group_by(TreatmentMDX) %>%
  mutate(RefValue=GrantSum/GrantSum[GrantMonth=="Jun 2011"]) %>%
  ungroup() %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=GrantYear-2012,
         GraphMonth=as.numeric(GrantMonth)-as.numeric(as.yearmon("2012-03")) )

H1eventGB <- AppsGraphGB %>%
  ggplot(aes(x=GraphMonth, y=RefValue, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(-2.5,0,2.5,5.0,7.5,10.0,12.5,15.0,17.5,20,22.5)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5, family="serif",geom="text") +
  ylab("Percentage of Base Value") +
  xlab("Grant Year (By Mayo)") +
  labs(title="*Figure 3d.* Grants to Small U.S.-Based Firms <br>by Grant Year and Patent Type") +
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,30,0),hjust=-0.15)) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H1eventGB2.tiff",H1eventGB,dpi=600,height=7,width=8)


#### PANEL GRAPH OF EVENT GRAPHS 1-4 ------------------------------

figureAD<-H1event+H1AeventA+H1eventG+H1eventGB+ 
         plot_layout(guides='collect') & 
         theme(legend.position='bottom',
               legend.text=element_text(size=18, family="serif"))

ggsave("figureAD.tiff",figureAD,dpi=600,height=11,width=14)

ggsave("figureAD2.png",figureAD,dpi=600,height=11,width=14)

##### H2: PERCENTAGE OF NEW APPS GRAPHING -------

AppStatusDataG2 <- df2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileYear,FileMonth) %>%
  group_by(Continuation,TreatmentMDX,FileYear,FileMonth) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total) %>%
  ungroup() %>%
  mutate(RefValueT=Total/first(Total),
         RefValueP=PercNew/first(PercNew)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FileYear-2012,
         GraphMonth=as.numeric(FileMonth)-as.numeric(as.yearmon("2012-03")) )

H2event <- AppStatusDataG2 %>%
  ggplot(aes(x=GraphMonth, y=PercNew, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(0.5, 0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents That Are New Applications") +
  xlab("File Year (By Mayo)") +
  #labs(title="Applications Over Time") +
  theme_classic() +
  theme(text=element_text(size=12)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("H2event.png",H2event,dpi=600,height=7,width=9)


#####  H2a: PERCENTAGE OF NEW APPS GRAPHING, BY ENTITY -------

#Only change is to use df3

AppStatusDataG2A <- df3 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentMDX,FileYear,FileMonth) %>%
  group_by(Continuation,TreatmentMDX,FileYear,FileMonth) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total) %>%
  ungroup() %>%
  mutate(RefValueT=Total/first(Total),
         RefValueP=PercNew/first(PercNew)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FileYear-2012,
         GraphMonth=as.numeric(FileMonth)-as.numeric(as.yearmon("2012-03")) )

H2eventA <- AppStatusDataG2A %>%
  ggplot(aes(x=GraphMonth, y=PercNew, group=TreatmentMDX, linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.3, color="black",se=TRUE,) +
  scale_y_continuous(labels=scales::percent, breaks=c(0,2,0.25,0.3,0.35,0.4,0.45,0.5, 0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9)) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents That Are New Applications") +
  xlab("File Year (By Mayo)") +
  #labs(title="Applications Over Time") +
  theme_classic() +
  theme(text=element_text(size=12)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("H2eventA.png",H2eventA,dpi=600,height=7,width=9)

##### H3 - Grant Rate --------------

GrantRateDataGraph2 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate & !is.na(TreatmentMDX)) %>%
  select(Granted,TreatmentMDX,ResolutionYear,ResolutionMonth) %>%
  group_by(TreatmentMDX,ResolutionMonth,ResolutionYear) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  ungroup %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=ResolutionYear-2012,
         GraphMonth=as.numeric(ResolutionMonth)-as.numeric(as.yearmon("2012-03")) )

H3event <- GrantRateDataGraph2 %>%
  ggplot(aes(x=GraphMonth, y=mean, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(labels=scales::percent, breaks=c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5,family="serif", geom="text") +
  ylab("Percentage of Patents Granted") +
  xlab("Year of Resolution") +
  labs(title="*Figure 5a.* Percentage of Patents Granted by Year of<br>Resolution and Patent Type")+
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H3event2.tiff",H3event,dpi=600,height=7,width=9)

##### H4 - Words ---------------------------------

Claim1WordsDataGraphE <- df2 %>%
  filter(Granted==1 & !is.na(TreatmentMDX) &
           GrantDate > StartDate &
           GrantDate < EndDate & PubClaim1TotalWords < 3000) %>%
  mutate(GrantYear=as.numeric(GrantYear)) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,GrantYear,GrantMonth,TreatmentMDX) %>%
  group_by(GrantYear,TreatmentMDX,GrantMonth) %>%
  summarize('Total Words'=mean(PubClaim1TotalWords, na.rm=TRUE),
            'Unique Words'=mean(PubClaim1UniqueWords,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=GrantYear-2012,
         GraphMonth=as.numeric(GrantMonth)-as.numeric(as.yearmon("2012-03"))) %>%
  pivot_longer(-c(GrantYear,TreatmentMDX,GrantMonth,GraphYear,GraphMonth)) %>%
  ungroup()

H4Tevent <- Claim1WordsDataGraphE %>%
  filter(name=='Total Words') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(breaks=c(40,60,80,100,120,140,160,180)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5,family="serif", geom="text") +
  ylab("Total Word Count") +
  xlab("Year of Granting") +
  labs(title="*Figure 5c.* Total Words in Claim 1 By Year of Grant<br>and Patent Type")+
  theme_classic() +
  theme(text=element_text(size=16, family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H4Tevent2.tiff",H4Tevent,dpi=600,height=7,width=9)

#Identical graph for unique words
H4Uevent <- Claim1WordsDataGraphE %>%
  filter(name=='Unique Words') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(breaks=c(40,60,80,100,120,140,160,180)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5,family="serif", geom="text") +
  ylab("Unique Word Count") +
  xlab("Year of Granting") +
  labs(title="*Figure 5d.* Unique Words in Claim 1 By Year of Grant<br>and Patent Type")+
  theme_classic() +
  theme(text=element_text(size=16,family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H4Uevent2.tiff",H4Uevent,dpi=600,height=7,width=9)

##### H5 - PENDENCY ---------

TimetoResDataGe <- df2 %>%
  filter(StillPending==0 & !is.na(TreatmentMDX) &
           ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(PendencytoGrant,PendencytoAbandon,ResolutionYear,ResolutionMonth,TreatmentMDX) %>%
  group_by(ResolutionYear,ResolutionMonth,TreatmentMDX) %>%
  summarize('Grant Pendency'=mean(PendencytoGrant, na.rm=TRUE),
            'Abandonment Pendency'=mean(PendencytoAbandon,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=ResolutionYear-2012,
         GraphMonth=as.numeric(ResolutionMonth)-as.numeric(as.yearmon("2012-03"))) %>%
  pivot_longer(-c(TreatmentMDX,ResolutionYear,ResolutionMonth,GraphYear,GraphMonth)) %>%
  ungroup() 


H5Aevent <- TimetoResDataGe%>%
  filter(name=='Grant Pendency') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5, family="serif",geom="text") +
  ylab("Number of Days Between Filing and Grant") +
  xlab("Year of Resolution") +
  labs(title="*Figure 5b.* Pendency to Grant by Year of Resolution<br>and Patent Type")+
  theme_classic() +
  theme(text=element_text(size=16,family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.title = ggtext::element_markdown()) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("H5Aevent2.tiff",H5Aevent,dpi=600,height=7,width=9)

H5Bevent <- TimetoResDataGe%>%
  filter(name=='Abandonment Pendency') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number of Days Between Filing and Abandonment") +
  xlab("Year of Resolution") +
  #labs(title="Patent Grant Rate (By Month) Over Time")+
  theme_classic() +
  theme(text=element_text(size=12)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("H5Bevent2.png",H5Bevent,dpi=600,height=7,width=9)

##### H7 - PUBLISHED WORDS -------------

PubWordDataG2E <- df2 %>%
  filter(Published=="Y" & !is.na(TreatmentMDX) &
           FileDate > StartDate & FileDate < EndDate &
           PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,
         FileYear,FileMonth,TreatmentMDX) %>%
  group_by(FileYear,FileMonth,TreatmentMDX) %>%
  summarize(Total=mean(PubClaim1TotalWords, na.rm=TRUE),
            Unique=mean(PubClaim1UniqueWords, na.rm=TRUE))%>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FileYear-2012,
         GraphMonth=as.numeric(FileMonth)-as.numeric(as.yearmon("2012-03"))) %>%
  pivot_longer(-c(TreatmentMDX, FileYear,FileMonth,GraphYear,GraphMonth)) %>%
  ungroup()

#Total words graph
Figure7 <- PubWordDataG2E %>%
  filter(name=='Total') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(breaks=c(40,60,80,100,120,140,160,180)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=5,family="serif", geom="text") +
  ylab("Number of Total Words") +
  xlab("File Year") +
  theme_classic() +
  theme(text=element_text(size=16,family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("Figure7.tiff",Figure7,dpi=600,height=7,width=9)
ggsave("Figure72.png",Figure7,dpi=600,height=7,width=9)

Figure8<- PubWordDataG2E %>%
  filter(name=='Unique') %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(breaks=c(40,60,80,100,120,140,160,180)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Number of Unique Words") +
  xlab("File Year") +
  theme_classic() +
  theme(text=element_text(size=16,family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,0.5,0.5,0.5, "cm"))

ggsave("Figure8.tiff",Figure8,dpi=600,height=7,width=9)
ggsave("Figure82.png",Figure8,dpi=600,height=7,width=9)



##### Appendix 101 Rejections -------------

Rej101GraphE <- df2 %>%
  filter(!is.na(TreatmentMDX) &
           FirstOADate > StartDate & FirstOADate < EndDate) %>%
  select(Any101Rej,Perc101Rej,FirstOAYear,FirstOAMonth,TreatmentMDX) %>%
  group_by(FirstOAYear,FirstOAMonth,TreatmentMDX) %>%
  summarize(Any=mean(Any101Rej, na.rm=TRUE),
            Perc=mean(Perc101Rej,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)"),
         GraphYear=FirstOAYear-2012,
         GraphMonth=as.numeric(FirstOAMonth)-as.numeric(as.yearmon("2012-03"))) %>%
  pivot_longer(-c(FirstOAYear,FirstOAMonth,GraphYear,GraphMonth,TreatmentMDX)) %>%
  ungroup()


Rej101GraphE2<- Rej101GraphE%>%
  filter(name=="Any") %>%
  ggplot(aes(x=GraphMonth, y=value, group=TreatmentMDX,linetype=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.1, color="black", se=TRUE) +
  scale_x_continuous(breaks=c(-2,-1,0,1,2,3,4,5,6,7), expand=c(0,0)) +
  scale_y_continuous(breaks=c(40,60,80,100,120,140,160,180)) +
  geom_vline(xintercept=0.3,color="black",linetype="dotted") +
  annotate(x=0.3,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents with Any 101 Rejections") +
  xlab("Year of First Office Action") +
  theme_classic() +
  theme(text=element_text(size=12)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("Rej101.png",Rej101GraphE2,dpi=600,height=7,width=9)



#### COMBINED PANEL GRAPH --------------------

#For the manuscript, we will attempt to make a graph that combines
#seven multiple graphs into a single panel.

#These are the graphs, with figure naming from a prior draft:

#Figure L: Applications, Small US-Based
#Figure M: Grant Rates
#Figure N: Initial Claim 1 Total Words
#Figure O: Initial Claim 1 Unique Words
#Figure P: Pendency to Grant
#Figure Q: Published Claim 1 Total Words
#Figure R: Published Claim 1 Unique Words

#Below I copy in the code that generated each graph. Please note this code
#is dependent on the proper model version being run first in each hypothesis'
#Analysis subection.

####Figure 2 Code--------------
figure2 <- cat_plot(modelAppsDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
                    lineend="round",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Daily Applications",
       x="Filing Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=12,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=12,family="serif")) +
  theme(legend.position="bottom") +
  theme(legend.key.width=unit(2,'cm'))


ggsave("Figure2.tiff",figure2,dpi=600,height=4,width=4.5)
ggsave("Figure2p.png",figure2,dpi=600,height=4,width=4.5)

####Figure 6a Code------------------
figure6a <- cat_plot(modelGrantRate, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Grant Rate (Daily)",
       x="Grant Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=18,family='serif')) +
  theme(legend.position="bottom") +
  theme(legend.key.width=unit(2,'cm'))

####Figure 6c Code----------------
figure6c <- cat_plot(modelClaim1WordsT, pred = MayoGrant, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Total Words",
       x="Grant Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=18,family='serif')) +
  theme(legend.position="bottom") +
  theme(legend.key.width=unit(2,'cm'))

####FIgure 6d Code------------
figure6d <- cat_plot(modelClaim1WordsU, pred = MayoGrant, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Unique Words",
       x="Grant Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))+
  theme(text=element_text(size=18,family="serif")) +
  theme(legend.position="bottom") +
  theme(legend.key.width=unit(2,'cm'))

####Figure 6b Code----------
figure6b <- cat_plot(modelTimetoResDID, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Pendency to Grant",
       x="Resolution Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=18,family="serif")) +
  theme(legend.position="bottom") +
  theme(legend.key.width=unit(2,'cm'))


# ####Figure 9a Code-----------

figure9a <- cat_plot(modelPubWordTDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Total Published Words",
       x="Filing Date")+
   theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=18,family="serif")) +
  theme(legend.position="right") +
  theme(legend.key.width=unit(2,'cm'))


# ####Figure 9b Code------------
figure9b <- cat_plot(modelPubWordUDID, pred = MayoFile, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Unique Published Words",
       x="Filing Date")+
  theme_classic()+
  theme(axis.title.y=element_text(margin=margin(0,10,0,0))) +
  theme(axis.title.x=element_blank()) +
  theme(axis.text.x=element_text(size=18,color="black")) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=18,family="serif")) +
  theme(legend.position="right") +
  theme(legend.key.width=unit(2,'cm'))

#### Combining graph panels-----------

#####Figure 6 in Total-------------

figure6a2<-figure6a+theme(legend.position="bottom")+theme(legend.key.width=unit(2,'cm'))
figure6<-figure6a2+figure6b+figure6c+figure6d+plot_layout(guides='collect') & theme(legend.position='bottom')

ggsave("Figure6a.tiff",figure6a,dpi=600,height=6,width=7)
ggsave("Figure6b.tiff",figure6b,dpi=600,height=6,width=7)
ggsave("Figure6c.tiff",figure6c,dpi=600,height=6,width=7)
ggsave("Figure6d.tiff",figure6d,dpi=600,height=6,width=7)

ggsave("Figure6.tiff",figure6,dpi=600,height=12,width=14)
ggsave("Figure62.png",figure6,dpi=600,height=12,width=14)

#####Figure 9 in Total-------------


figure9a2<-figure9a+theme(legend.position="bottom")+theme(legend.key.width=unit(2,'cm'))
figure9<-figure9a2+figure9b+plot_layout(guides='collect') & theme(legend.position='bottom')

ggsave("Figure9.tiff",figure9,dpi=600,height=6,width=13)
ggsave("Figure92.png",figure9,dpi=600,height=6,width=13)


####Single plot---------------------

#To clarify this complex graph, we will put a table of its contents inside
#the graph panel. It will also visually balance the seven plots.

panel1<-"1: H1a: Applications by Small, U.S.-Based Companies"
panel2<-"2: H3: Grant Rates (Daily)"
panel3<-"3: H4: Pendency to Grant (Number of Days)"
panel4<-"4: H5: Total Words in Claim 1 Applications"
panel5<-"5: H5: Unique Words in Claim 1 Applications"
panel6<-"6: H6: Total Words in Published Claim 1 Applications"
panel7<-"7: H7: Unique Words in Published Claim 1 Applications"

#There was almost certainly a smarter way to do this, but.
text1 <- ggplot(data = tibble(x = 0, y = 1)) +
  aes(x = x, y = y) +
  geom_textbox(
    box.color = NA,
    fill = NA,
    hjust = 0,
    vjust = 2.75,
    label = "**Panel Legend:**",
    size=5
  ) +
  geom_text(label=panel1, hjust=0, vjust=8,nudge_x=0.008,size=5) +
  geom_text(label=panel2, hjust=0, vjust=10.2,nudge_x=0.008,size=5) +
  geom_text(label=panel3, hjust=0, vjust=12.4,nudge_x=0.008,size=5) +
  geom_text(label=panel4, hjust=0, vjust=14.6,nudge_x=0.008,size=5) +
  geom_text(label=panel5, hjust=0, vjust=16.8,nudge_x=0.008,size=5) +
  geom_text(label=panel6, hjust=0, vjust=19,nudge_x=0.008,size=5) +
  geom_text(label=panel7, hjust=0, vjust=21.2,nudge_x=0.008,size=5) +
  scale_x_continuous(limits = c(0, 1), expand = c(0, 0)) +
  scale_y_continuous(limits = c(0, 1), expand = c(0, 0)) +
  coord_cartesian(clip="off")+
  theme_void() +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) 

figureM2 <- figureM+theme(legend.position="bottom")
figureN2 <- figureN+theme(legend.position="bottom")
figureO2 <- figureO+theme(legend.position="bottom")
figureP2 <- figureP+theme(legend.position="bottom")
figureQ2 <- figureQ+theme(legend.position="bottom")
figureR2 <- figureR+theme(legend.position="bottom")

figureHA <- figureL+figureM2+figureP2+figureN2+figureO2+figureQ2+figureR2 + text1+
  plot_layout(guides="collect",ncol=2) & theme(legend.position="bottom")

ggsave("FigureHA.png",figureHA,dpi=600,height=18,width=14)
ggsave("FigureHA.svg",figureHA,dpi=600,height=6,width=7)

#### BOX 1 PIE CHART OF APPLICATIONS BY ENTITY --------------

#We need to calculate the share of each entity class for moleDX
#applications, 2010-2019

Box1Data <- df2 %>%
  select(FileYear,SLS2,NormalizedApplicationNumber,TreatmentMDX,AllInventorsUS) %>%
  filter(FileYear > 2009 & FileYear < 2020 & !is.na(SLS2) &
           AllInventorsUS==1 & TreatmentMDX==1) %>%
  group_by(SLS2) %>%
  summarize(AppSum=n())%>%
  mutate(Perc=AppSum/sum(AppSum)) %>%
  ungroup()

colors()

Box1Graph <- Box1Data %>%
  ggplot(aes(x=reorder(SLS2,-Perc),y=Perc, fill=reorder(SLS2,-Perc))) +
  geom_col() +
  scale_fill_manual(values=c("grey10","grey30","grey50","grey70")) +
  scale_y_continuous(expand=c(0,0),labels=scales::percent,breaks=c(0,.10,.20,.30,.40,.50,.60),
                     limits=c(0,.60))+
  geom_text(aes(label=scales::percent(round(Perc,4))),vjust=-1,family="serif") +
  labs(y="Percentage of Total",x="Likely Setting") +
  theme_classic() +
  theme(text=element_text(size=14,family="serif")) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("Figure1.tiff", Box1Graph,dpi=600,height=6,width=7)  
ggsave("Figure1p.png", Box1Graph,dpi=600,height=6,width=7)

#### ASSESSMENT OF INTEROUTCOME CORRELATIONS ----------------------------

#It is suggested we look at the correlations between our outcomes to make a
#stronger case...

df2corr <- df2 %>%
  select(Granted,Continuation, PubClaim1TotalWords,
         PubClaim1UniqueWords, PendencytoGrant,PendencytoAbandon)%>%
  mutate(Continuation=if_else(Continuation=='New Patent',0,1),
         PubClaim1TotalWords=as.numeric(PubClaim1TotalWords),
         PubClaim1UniqueWords=as.numeric(PubClaim1UniqueWords))

corr.test(df2corr)
  
#### APPENDICES AND ROBUSTNESS CHECKS -----------------------------------

#### TOTAL NUMBER OF OAS ANALYSIS ---------------

#After soliciting feedback on the manuscript, we received additional data from
#Dr. S. Sean Tu, giving us more fine-grained information on office actions for
#our relevant patents.

dfOA <- read_csv("medDXOAInfo.csv")

#We require only patents in our datasets...
patentIDs <- df2$NormalizedApplicationNumber

dfOA2 <- dfOA %>%
  filter(NormalizedApplicationNumber %in% patentIDs)

df2a <- inner_join(df2,dfOA2, by="NormalizedApplicationNumber")

#Now we can look at the number of OAs as a function of Mayo as well.

#We look specifically at number of OAs for all resolved patents, first

colnames(df2)

OAData1 <- df2a %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(NumberOAs,TreatmentMDX,ResolutionDate,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(NumberOAs, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
OADataPT <- OAData1 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelOADataPT<- lm(mean ~ ResolutionDate*TreatmentMDX, 
                      data=OADataPT)
summary(modelOADataPT)
coeftest(modelOADataPT, vcov. = vcovHC(modelOADataPT, type='HC3'))

#Main Analysis
#We do not see an effect.
modelOAData<- lm(mean ~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                    data=OAData1)
summary(modelOAData)

bptest(modelOAData)
coeftest(modelOAData, vcov. = vcovHC(modelOAData, type='HC3'))

##### Number of OAs for Grants ---------------

OAData2 <- df2a %>%
  filter(Granted==1 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(NumberOAs,TreatmentMDX,ResolutionDate,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(NumberOAs, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
OAData2PT <- OAData2 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelOAData2PT<- lm(mean ~ ResolutionDate*TreatmentMDX, 
                      data=OAData2PT)
summary(modelOAData2PT)
coeftest(modelOAData2PT, vcov. = vcovHC(modelOAData2PT, type='HC3'))

#Main Analysis
#We DO see an effect, for granted patents.
modelOAData2<- lm(mean ~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                    data=OAData2)
summary(modelOAData2)

bptest(modelOAData2)
coeftest(modelOAData2, vcov. = vcovHC(modelOAData2, type='HC3'))

#This code extracts marginal means for a table.

modelOAData2EM <- emmeans(modelOAData2, ~MayoRes*TreatmentMDX,
                            vcov = sandwich::vcovHC(modelOAData2,type='HC3'))

em<-as.data.frame(modelOAData2EM)
em2<-em %>%
  select(MayoRes,TreatmentMDX,emmean,SE) %>%
  pivot_wider(names_from=MayoRes,values_from=c(emmean,SE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,4)," (",round(SE_Before,4),")"),
         CombA=paste0(round(emmean_After,4)," (",round(SE_After,4),")"))

write_clip(unname(em2[1:2,6:7]))

#Interaction plot

FigureS <- cat_plot(modelOAData2, pred = MayoRes, modx = TreatmentMDX, geom = "line",
         interval = FALSE,
         colors=c("black","black"),
         vary.lty=TRUE,
         point.shape=TRUE,
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:") +
  labs(y="Average Number of Office Actions for Granted Patents",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))


ggsave("FigureS.png",FigureS,dpi=600,height=6,width=7)

#Now we need to get an effect size measure.

#First we obtain the RMSE
RSS <- c(crossprod(modelOAData2$residuals))
MSE <- RSS / length(modelOAData2$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
OAData4 <- OAData2 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentMDX)) %>%
  group_by(TreatmentMDX,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-OAData4$mean[1]
A2<-OAData4$mean[2]
B1<-OAData4$mean[3]
B2<-OAData4$mean[4]

#Then we can calculate Cohen's d
#Medium effect size of 0.33
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### Number of OAs for Abandoned ---------------

OAData3 <- df2a %>%
  filter(Abandoned==1 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(NumberOAs,TreatmentMDX,ResolutionDate,MayoRes) %>%
  group_by(TreatmentMDX,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(NumberOAs, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
OAData3PT <- OAData3 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelOAData3PT<- lm(mean ~ ResolutionDate*TreatmentMDX, 
                      data=OAData3PT)
summary(modelOAData3PT)
coeftest(modelOAData3PT, vcov. = vcovHC(modelOAData3PT, type='HC3'))

#Main Analysis
#We do not see an effect.
modelOAData3<- lm(mean ~ MayoRes*TreatmentMDX + as.Date(ResolutionDate), 
                    data=OAData3)
summary(modelOAData3)

bptest(modelOAData3)
coeftest(modelOAData3, vcov. = vcovHC(modelOAData3, type='HC3'))

#### 101 REJECTIONS ROBUSTNESS CHECK --------------------------

#This is a variant analysis where we consider number of 101 rejections as an
#alternative metric for examiner toughness.

df2 <- df2 %>%
  mutate(MayoFirstOA = case_when(
    FirstOADate < MayoDate ~ "Before",
    FirstOADate > MayoDate  ~ "After",
    TRUE ~ NA_character_))

#Parallel trends

DID101Rej <- df2 %>%
  filter(!is.na(TreatmentMDX) &
           FirstOADate > StartDate & FirstOADate < EndDate) %>%
  select(Any101Rej,Perc101Rej,FirstOADate,MayoFirstOA,TreatmentMDX) %>%
  group_by(FirstOADate,TreatmentMDX,MayoFirstOA) %>%
  summarize(Any=mean(Any101Rej, na.rm=TRUE),
            Perc=mean(Perc101Rej,na.rm=TRUE)) %>%
  ungroup()

DID101RejPT<- DID101Rej %>%
  filter(MayoFirstOA=="Before")

table(DID101RejPT$Any101Rej)

#Any101 is not parallel.
modelAny101RejPT <- lm(Any ~ FirstOADate*TreatmentMDX, 
                       data=DID101RejPT)
summary(modelAny101RejPT)
bptest(modelAny101RejPT)
coeftest(modelAny101RejPT, vcov. = vcovHC(modelAny101RejPT, type='HC3'))

#Perc101 is also not parallel. 
modelPerc101RejPT <- lm(Perc ~ FirstOADate*TreatmentMDX, 
                       data=DID101RejPT)
summary(modelPerc101RejPT)
bptest(modelPerc101RejPT)
coeftest(modelPerc101RejPT, vcov. = vcovHC(modelPerc101RejPT, type='HC3'))

##### Parallel trends graph -----------------------

Rej101Graph <- df2 %>%
  filter(!is.na(TreatmentMDX) & !is.na(MayoFirstOA) &
           FirstOADate > StartDate & FirstOADate < EndDate) %>%
  select(Any101Rej,Perc101Rej,FirstOAYear,MayoFirstOA,TreatmentMDX) %>%
  group_by(FirstOAYear,TreatmentMDX,MayoFirstOA) %>%
  summarize(Any=mean(Any101Rej, na.rm=TRUE),
            Perc=mean(Perc101Rej,na.rm=TRUE)) %>%
  mutate(TreatmentMDX = ifelse(TreatmentMDX==0,"Comparator (TC1600)","Treatment (Molecular Diagnostic)")) %>%
  ungroup()

FigureT <- Rej101Graph%>%
  ggplot(aes(x=FirstOAYear, y=Perc, group=TreatmentMDX, color=TreatmentMDX)) +
  geom_smooth(size=1, alpha=0.2,span=0.8, se=FALSE) +
  scale_color_manual(values=c("black","blue")) +
  scale_x_continuous(breaks=c(2010,2012,2014,2016,2018), expand=c(0,0)) +
  scale_y_continuous(limits=function(x){c(0,max(0.1,x))}, labels=scales::percent) +
  geom_vline(xintercept=2012.2,color="black",linetype="dotted") +
  annotate(x=2012.2,y=+Inf,label="Mayo    3/2012",hjust=0.45, vjust=1.5, size=4, geom="text") +
  ylab("Percentage of Patents with 101 Rejections") +
  xlab("Year of First Office Action") +
  theme_classic() +
  theme(text=element_text(size=16)) +
  theme(legend.position="bottom",legend.title=element_blank()) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.title = element_text(margin=margin(0,0,15,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

ggsave("FigureT.png",FigureT,dpi=600,height=6,width=9)

#### AU1634 ROBUSTNESS CHECK -----------------

#This is a variant analysis where AU1634 is the treatment
#and the control is everything that's TC1600 but NOT
#1634, 1631, or 1639.

#For parallel trends..

##### AU1634 H1 ------------------

df2 <- df2 %>%
  mutate(TreatmentAU1634 = case_when(
    ArtUnit != 1634 & TC1600==1 ~ "0",
    ArtUnit == 1634 ~  "1"))

table(df2$ArtUnit)

AUOnly <- df2 %>%
  filter(TreatmentAU1634==1 & FileDate > StartDate & FileDate < EndDate) %>%
  select(TreatmentAU1634,AppSerialNumber,PublicationNumber,Granted,FileDate,PublicationDate,
         GrantDate,ArtUnit)

#ANd then ditto for the TC1600 sample

TC1600only3 <- dfTC1600 %>%
  filter(FilingDate > StartDate & FilingDate < EndDate & 
           GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639"
         & GroupArtUnitNumber != "1634") %>%
  select(ApplicationNumber,PublicationNumber, FilingDate,PublicationDate,GrantDate,
         GroupArtUnitNumber) %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0)) %>%
  rename(AppSerialNumber=ApplicationNumber,FileDate=FilingDate,
         ArtUnit=GroupArtUnitNumber) %>%
  mutate(TreatmentAU1634=0)

H1Data3 <- rbind(AUOnly,TC1600only3)

table(H1Data3$TreatmentAU1634,H1Data3$ArtUnit)

#Did any duplicates end up in here?
#Sure didn't - whew
dupIdH1<-H1Data3[duplicated(H1Data3$AppSerialNumber),]

#Now we need Mayos for THESE

H1Data3 <- H1Data3 %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

PubMayo <- mdy("6-20-2013")
PubMayo2Years <- mdy("6-20-2015")

H1Data3 <- H1Data3 %>%
  mutate(MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_))  

H1Data3 <- H1Data3 %>%
  mutate(MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

H1Data3 <- H1Data3 %>%
  mutate(across(c(MayoPub,MayoFile,MayoGrant,), 
                ~factor(.x, levels=c("Before","After"))))

H1Data3 <- H1Data3 %>%
  mutate(GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate),
         GrantYear=year(GrantDate),
         FileYear=year(FileDate),
         PubYear=year(PublicationDate)
  )

#Now the analyses

AppsDID <- H1Data3 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, AppSerialNumber,TreatmentAU1634,FileDate) %>%
  group_by(MayoFile,TreatmentAU1634,FileDate) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsDID <- H1Data3 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentAU1634,PublicationDate) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentAU1634,PublicationDate) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsDID <- H1Data3 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoGrant) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentAU1634,GrantDate) %>%
  group_by(MayoGrant,TreatmentAU1634,GrantDate) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

AppsPT <- AppsDID %>%
  filter(MayoFile=="Before")

PubsPT <- PubsDID %>%
  filter(MayoPub=="Before")

GrantsPT <- GrantsDID %>%
  filter(MayoGrant=="Before")

#Apps are parallel
modelAppsPT <- lm(AppNum ~ FileDate*TreatmentAU1634, 
                  data=AppsPT)
summary(modelAppsPT)
coeftest(modelAppsPT, vcov. = vcovHC(modelAppsPT, type='HC3'))


#GRants are NOT parallel
modelGrantsPT <- lm(GrantSum ~ GrantDate*TreatmentAU1634, 
                    data=GrantsPT)
summary(modelGrantsPT)
coeftest(modelGrantsPT, vcov. = vcovHC(modelGrantsPT, type='HC3'))

#DiD ANalysis for Apps
#No effect
modelAppsDID<- lm(AppNum ~ MayoFile*TreatmentAU1634 + as.Date(FileDate), 
                  data=AppsDID)
summary(modelAppsDID)
coeftest(modelAppsDID, vcov. = vcovHC(modelAppsDID, type='HC3'))

modelAppsDIDEM <- emmeans(modelAppsDID, ~MayoFile*TreatmentAU1634,
                          vcov = sandwich::vcovHC(modelAppsDID,type='HC3'))

em<-as.data.frame(modelAppsDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentAU1634,emmean,SE) %>%
  pivot_wider(names_from=MayoFile,values_from=c(emmean,SE)) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

##### AU1634 H1A ---------------

AUOnly2 <- df2 %>%
  filter(TreatmentAU1634==1 & FileDate > StartDate & FileDate < EndDate 
         & Domestic=="Y" & SimplifiedLikelySetting2=="3_likely_small_co") %>%
  select(TreatmentAU1634,AppSerialNumber,PublicationNumber,Granted,FileDate,PublicationDate,
         GrantDate) 

#ANd then ditto for the TC1600 sample

TC1600only4 <- dfTC1600 %>%
  filter(FilingDate > StartDate & FilingDate < EndDate 
         & GroupArtUnitNumber != "1631" & GroupArtUnitNumber != "1639"
         & GroupArtUnitNumber != "1634" & Domestic=="Y" & 
           SimplifiedLikelySetting2=="3_likely_small_co") %>%
  select(ApplicationNumber,PublicationNumber,FilingDate,PublicationDate,GrantDate) %>%
  mutate(Granted = ifelse(!is.na(GrantDate),1,0)) %>%
  rename(AppSerialNumber=ApplicationNumber,FileDate=FilingDate) %>%
  mutate(TreatmentAU1634=0)

H1Data4 <- rbind(AUOnly2,TC1600only4)

#Now we need to add date and Mayo data.

H1Data4 <- H1Data4 %>%
  mutate(MayoGrant = case_when(
    GrantDate < MayoDate ~ "Before",
    GrantDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

PubMayo <- mdy("6-20-2013")
PubMayo2Years <- mdy("6-20-2015")

H1Data4 <- H1Data4 %>%
  mutate(MayoPub = case_when(
    PublicationDate < PubMayo ~ "Before",
    PublicationDate > PubMayo  ~ "After",
    TRUE ~ NA_character_))  

H1Data4 <- H1Data4 %>%
  mutate(MayoFile = case_when(
    FileDate < MayoDate ~ "Before",
    FileDate > MayoDate  ~ "After",
    TRUE ~ NA_character_))  

H1Data4 <- H1Data4 %>%
  mutate(across(c(MayoPub,MayoFile,MayoGrant,), 
                ~factor(.x, levels=c("Before","After"))))

H1Data4 <- H1Data4 %>%
  mutate(GrantMonth=as.yearmon(GrantDate),
         FileMonth=as.yearmon(FileDate),
         PubMonth=as.yearmon(PublicationDate),
         GrantYear=year(GrantDate),
         FileYear=year(FileDate),
         PubYear=year(PublicationDate)
  )

#NOw real analysis
AppsDID <- H1Data4 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(MayoFile, AppSerialNumber,TreatmentAU1634,FileDate) %>%
  group_by(MayoFile,TreatmentAU1634,FileDate) %>%
  summarize(AppNum=n()) %>%
  ungroup()

PubsDID <- H1Data4 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoPub) &
           PublicationDate > StartDate & PublicationDate < EndDate) %>%
  select(MayoPub,PublicationNumber,TreatmentAU1634,PublicationDate) %>%
  mutate(PubNum = ifelse(!is.na(PublicationNumber),1,0)) %>%
  group_by(MayoPub,TreatmentAU1634,PublicationDate) %>%
  summarize(PubSum=sum(PubNum, na.rm=TRUE))  %>%
  ungroup()

GrantsDID <- H1Data4 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoGrant) & !is.na(MayoGrant) &
           GrantDate > StartDate & GrantDate < EndDate) %>%
  select(MayoGrant,Granted,TreatmentAU1634,GrantDate) %>%
  group_by(MayoGrant,TreatmentAU1634,GrantDate) %>%
  summarize(GrantSum=sum(Granted, na.rm=TRUE)) %>%
  ungroup() 

AppsPT <- AppsDID %>%
  filter(MayoFile=="Before")

PubsPT <- PubsDID %>%
  filter(MayoPub=="Before")

GrantsPT <- GrantsDID %>%
  filter(MayoGrant=="Before")

#Apps are parallel
modelAppsPT <- lm(AppNum ~ FileDate*TreatmentAU1634, 
                  data=AppsPT)
summary(modelAppsPT)
coeftest(modelAppsPT, vcov. = vcovHC(modelAppsPT, type='HC3'))

#GRants cannot be tested, too small!
modelGrantsPT <- lm(GrantSum ~ GrantDate*TreatmentAU1634, 
                    data=GrantsPT)
summary(modelGrantsPT)
coeftest(modelGrantsPT, vcov. = vcovHC(modelGrantsPT, type='HC3'))

#DiD ANalysis for Apps
#Significant!
modelAppsDID<- lm(AppNum ~ MayoFile*TreatmentAU1634 + as.Date(FileDate), 
                  data=AppsDID)
summary(modelAppsDID)
bptest(modelAppsDID)
coeftest(modelAppsDID, vcov. = vcovHC(modelAppsDID, type='HC3'))

#Graph to probe
cat_plot(modelAppsDID, pred = MayoFile, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Number of Applications",
       x="Mayo Timing of File Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))
theme(text=element_text(size=14))

modelAppsDIDEM <- emmeans(modelAppsDID, ~MayoFile*TreatmentAU1634,
                          vcov = sandwich::vcovHC(modelAppsDID,type='HC3'))

em<-as.data.frame(modelAppsDIDEM)
em2<-em %>%
  select(MayoFile,TreatmentAU1634,emmean) %>%
  pivot_wider(names_from=MayoFile,values_from=emmean) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

#First we obtain the RMSE
RSS <- c(crossprod(modelAppsDID$residuals))
MSE <- RSS / length(modelAppsDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
AppsData2 <- AppsDID %>%
  filter(!is.na(MayoFile)) %>%
  group_by(TreatmentAU1634,MayoFile) %>%
  summarize(mean=mean(AppNum, na.rm=TRUE))

A1<-AppsData2$mean[1]
A2<-AppsData2$mean[2]
B1<-AppsData2$mean[3]
B2<-AppsData2$mean[4]

#Then we can calculate Cohen's d
#Effect size of -0.24
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H2a ---------------

#Repeated for small US-only 
#Only change is using df3.

df3 <- df3 %>%
  mutate(TreatmentAU1634 = case_when(
    ArtUnit != 1634 & TC1600==1 ~ "0",
    ArtUnit == 1634 ~  "1"))

AppStatusData <- df3 %>%
  filter(!is.na(TreatmentAU1634) & !is.na(MayoFile) &
           FileDate > StartDate & FileDate < EndDate) %>%
  select(AppSerialNumber,Continuation,TreatmentAU1634,FileDate,MayoFile) %>%
  group_by(Continuation,TreatmentAU1634,FileDate,MayoFile) %>%
  summarize(AppNum=n()) %>%
  pivot_wider(names_from=Continuation, values_from=AppNum) %>%
  rowwise() %>%
  mutate(Total=sum(c(Continuation,`New Patent`),na.rm=TRUE),
         PercNew=`New Patent`/Total)

#Now a PT dataset
AppStatusPT <- AppStatusData %>%
  filter(MayoFile=="Before")

#Share of new IS parallel...
modelAppStatusPT <- lm(PercNew ~ FileDate*TreatmentAU1634, 
                       data=AppStatusPT)
summary(modelAppStatusPT)
coeftest(modelAppStatusPT, vcov. = vcovHC(modelAppStatusPT, type='HC3'))

#DiD Analysis IS significant.
modelAppStatusDID<- lm(PercNew~ MayoFile*TreatmentAU1634 + as.Date(FileDate), 
                       data=AppStatusData)
summary(modelAppStatusDID)
coeftest(modelAppStatusDID, vcov. = vcovHC(modelAppStatusDID, type='HC3'))

#We probe
cat_plot(modelAppStatusDID, pred = MayoFile, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Percentage of Applications That Are New",
       x="Mayo Timing of File Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))

#Now we need to get an effect size measure

#First we obtain the RMSE
RSS <- c(crossprod(modelAppStatusDID$residuals))
MSE <- RSS / length(modelAppStatusDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
AppStatusDataMeans <- AppStatusData %>%
  filter(!is.na(MayoFile) & !is.na(TreatmentAU1634)) %>%
  group_by(TreatmentAU1634,MayoFile) %>%
  summarize(mean=mean(PercNew, na.rm=TRUE))

A1<-AppStatusDataMeans$mean[1]
A2<-AppStatusDataMeans$mean[2]
B1<-AppStatusDataMeans$mean[3]
B2<-AppStatusDataMeans$mean[4]

#Then we can calculate Cohen's d
#small 0.13
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H3 - GRANT RATES ---------------------

GrantRateData2 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(Granted,TreatmentAU1634,ResolutionDate,MayoRes) %>%
  group_by(TreatmentAU1634,ResolutionDate,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  mutate(ResolutionDate=as.numeric(ResolutionDate)) %>%
  ungroup()


#Separate dataset for PTs  
GrantRatePT <- GrantRateData2 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Yes, it is parallel
modelGrantRatePT<- lm(mean ~ ResolutionDate*TreatmentAU1634, 
                      data=GrantRatePT)
summary(modelGrantRatePT)
coeftest(modelGrantRatePT, vcov. = vcovHC(modelGrantRatePT, type='HC3'))

#Main Analysis
#We DO see a DID effect
modelGrantRate<- lm(mean ~ MayoRes*TreatmentAU1634 + as.Date(ResolutionDate), 
                    data=GrantRateData2)
summary(modelGrantRate)

bptest(modelGrantRate)
coeftest(modelGrantRate, vcov. = vcovHC(modelGrantRate, type='HC3'))


modelGrantRateEM <- emmeans(modelGrantRate, ~MayoRes*TreatmentAU1634,
                            vcov = sandwich::vcovHC(modelGrantRate,type='HC3'))

#EMs and Graphs
em<-as.data.frame(modelGrantRateEM)
em2<-em %>%
  select(MayoRes,TreatmentAU1634,emmean,SE) %>%
  pivot_wider(names_from=MayoRes,values_from=c(emmean,SE)) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment")) %>%
  mutate(CombB=paste0(round(emmean_Before,4)," (",round(SE_Before,4),")"),
         CombA=paste0(round(emmean_After,4)," (",round(SE_After,4),")"))

write_clip(unname(em2[1:2,6:7]))

#Interaction plot

cat_plot(modelGrantRate, pred = MayoRes, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Grant Rate (Daily)",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))

#Now we need to get an effect size measure

#First we obtain the RMSE
RSS <- c(crossprod(modelGrantRate$residuals))
MSE <- RSS / length(modelGrantRate$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData4 <- GrantRateData2 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentAU1634)) %>%
  group_by(TreatmentAU1634,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData4$mean[1]
A2<-GrantRateData4$mean[2]
B1<-GrantRateData4$mean[3]
B2<-GrantRateData4$mean[4]

#Then we can calculate Cohen's d
#small -0.10. 
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H3 - GRANT RATES, ALT WEEKLY ------------

GrantRateData7 <- df2 %>%
  filter(StillPending==0 & ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  mutate(ResYearPlus=ResolutionYear-2010,
         ResolutionWeekYear=week(ResolutionDate),
         ResolutionWeek=ResYearPlus*52+ResolutionWeekYear) %>%
  select(Granted,TreatmentAU1634,ResolutionWeek,MayoRes) %>%
  group_by(TreatmentAU1634,ResolutionWeek,MayoRes) %>%
  summarize(mean=mean(Granted, na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
GrantRatePT4 <- GrantRateData7 %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Nope, sure is not parallel
modelGrantRatePT4<- lm(mean ~ ResolutionWeek*TreatmentAU1634, 
                       data=GrantRatePT4)
summary(modelGrantRatePT4)
coeftest(modelGrantRatePT4, vcov. = vcovHC(modelGrantRatePT4, type='HC3'))

#Main Analysis
#We do see a DID effect, marginally
modelGrantRate4<- lm(mean ~ MayoRes*TreatmentAU1634 + ResolutionWeek, 
                     data=GrantRateData7)
summary(modelGrantRate4)

bptest(modelGrantRate4)
coeftest(modelGrantRate4, vcov. = vcovHC(modelGrantRate4, type='HC3'))

#Interaction plot

cat_plot(modelGrantRate4, pred = MayoRes, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Grant Rate (Weekly)",
       x="Mayo Timing of Grant Week")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
  theme(text=element_text(size=14))


#Effect size for weekly?
RSS <- c(crossprod(modelGrantRate4$residuals))
MSE <- RSS / length(modelGrantRate4$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
GrantRateData8 <- GrantRateData7 %>%
  filter(!is.na(MayoRes) & !is.na(TreatmentAU1634)) %>%
  group_by(TreatmentAU1634,MayoRes) %>%
  summarize(mean=mean(mean, na.rm=TRUE))

A1<-GrantRateData8$mean[1]
A2<-GrantRateData8$mean[2]
B1<-GrantRateData8$mean[3]
B2<-GrantRateData8$mean[4]

#d=-0.21
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H4 --------------

Claim1WordsData <- df2 %>%
  filter(Granted==1 & !is.na(TreatmentAU1634) &
           GrantDate > StartDate &
           GrantDate < EndDate & PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords, GrantDate,MayoGrant,TreatmentAU1634) %>%
  group_by(GrantDate,TreatmentAU1634,MayoGrant) %>%
  summarize(MeanWordsT=mean(PubClaim1TotalWords, na.rm=TRUE),
            MeanWordsU=mean(PubClaim1UniqueWords,na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
Claim1WordsDataPT <- Claim1WordsData %>%
  filter(MayoGrant=="Before")

#Parallel trends analysis
#Parallel
modelClaim1WordsTPT<- lm(MeanWordsT ~ GrantDate*TreatmentAU1634, 
                         data=Claim1WordsDataPT)
summary(modelClaim1WordsTPT)
coeftest(modelClaim1WordsTPT, vcov. = vcovHC(modelClaim1WordsTPT, type='HC3'))


#And parallel
modelClaim1WordsUPT<- lm(MeanWordsU ~ GrantDate*TreatmentAU1634, 
                         data=Claim1WordsDataPT)
summary(modelClaim1WordsUPT)
coeftest(modelClaim1WordsUPT, vcov. = vcovHC(modelClaim1WordsUPT, type='HC3'))

#So now we do actual DID analysis.
#Yes, there's an effect
modelClaim1WordsT<- lm(MeanWordsT ~ MayoGrant*TreatmentAU1634 + as.Date(GrantDate), 
                       data=Claim1WordsData)
summary(modelClaim1WordsT)

bptest(modelClaim1WordsT)
coeftest(modelClaim1WordsT, vcov. = vcovHC(modelClaim1WordsT, type='HC3'))

modelClaim1WordsTEM <- emmeans(modelClaim1WordsT, ~MayoGrant*TreatmentAU1634,
                               vcov = sandwich::vcovHC(modelClaim1WordsT,type='HC3'))

em<-as.data.frame(modelClaim1WordsTEM)
em2<-em %>%
  select(MayoGrant,TreatmentAU1634,emmean) %>%
  pivot_wider(names_from=MayoGrant,values_from=emmean) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

cat_plot(modelClaim1WordsT, pred = MayoGrant, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Total Words",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))


#First we obtain the RMSE
RSS <- c(crossprod(modelClaim1WordsT$residuals))
MSE <- RSS / length(modelClaim1WordsT$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
ClaimWords <- Claim1WordsData %>%
  filter(!is.na(MayoGrant) & !is.na(TreatmentAU1634)) %>%
  group_by(TreatmentAU1634,MayoGrant) %>%
  summarize(mean=mean(MeanWordsT, na.rm=TRUE))

A1<-ClaimWords$mean[1]
A2<-ClaimWords$mean[2]
B1<-ClaimWords$mean[3]
B2<-ClaimWords$mean[4]

#Then we can calculate Cohen's d
#big effect, d = 0.38
d=((A1-B1) - (A2-B2))/(2*RMSE)

#What about for unique words??
modelClaim1WordsU<- lm(MeanWordsU ~ MayoGrant*TreatmentAU1634, 
                       data=Claim1WordsData)
summary(modelClaim1WordsU)

bptest(modelClaim1WordsU)
coeftest(modelClaim1WordsU, vcov. = vcovHC(modelClaim1WordsU, type='HC3'))

modelClaim1WordsUEM <- emmeans(modelClaim1WordsU, ~MayoGrant*TreatmentAU1634,
                               vcov = sandwich::vcovHC(modelClaim1WordsU,type='HC3'))

em<-as.data.frame(modelClaim1WordsUEM)
em2<-em %>%
  select(MayoGrant,TreatmentAU1634,emmean) %>%
  pivot_wider(names_from=MayoGrant,values_from=emmean) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

cat_plot(modelClaim1WordsU, pred = MayoGrant, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Unique Words",
       x="Mayo Timing of Grant Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm"))

#First we obtain the RMSE
RSS <- c(crossprod(modelClaim1WordsU$residuals))
MSE <- RSS / length(modelClaim1WordsU$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
ClaimWords2 <- Claim1WordsData %>%
  filter(!is.na(MayoGrant) & !is.na(TreatmentAU1634)) %>%
  group_by(TreatmentAU1634,MayoGrant) %>%
  summarize(mean=mean(MeanWordsU, na.rm=TRUE))

A1<-ClaimWords2$mean[1]
A2<-ClaimWords2$mean[2]
B1<-ClaimWords2$mean[3]
B2<-ClaimWords2$mean[4]

#Then we can calculate Cohen's d
#0.37

d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H5 ----------------------------

TimetoResData <- df2 %>%
  filter(StillPending==0 & !is.na(TreatmentAU1634) &
           ResolutionDate > StartDate &
           ResolutionDate < EndDate) %>%
  select(PendencytoGrant,PendencytoAbandon,ResolutionDate,MayoRes,TreatmentAU1634) %>%
  group_by(ResolutionDate,TreatmentAU1634,MayoRes) %>%
  summarize(GrantP=mean(PendencytoGrant, na.rm=TRUE),
            AbandonP=mean(PendencytoAbandon,na.rm=TRUE)) %>%
  ungroup()

#Separate dataset for PTs  
TimetoResPT <- TimetoResData %>%
  filter(MayoRes=="Before")

#Parallel trends analysis
#Parallel for grants and abandoned
modelTimetoResGPT<- lm(GrantP ~ ResolutionDate*TreatmentAU1634, 
                       data=TimetoResPT)
summary(modelTimetoResGPT)
coeftest(modelTimetoResGPT, vcov. = vcovHC(modelTimetoResGPT, type='HC3'))

modelTimetoResAPT<- lm(AbandonP ~ ResolutionDate*TreatmentAU1634, 
                       data=TimetoResPT)
summary(modelTimetoResAPT)
coeftest(modelTimetoResAPT, vcov. = vcovHC(modelTimetoResAPT, type='HC3'))

#Actual analysis for grants
#DID effect!
modelTimetoResGDID<- lm(GrantP~ MayoRes*TreatmentAU1634 + as.Date(ResolutionDate), 
                        data=TimetoResData)
summary(modelTimetoResGDID)
coeftest(modelTimetoResGDID, vcov. = vcovHC(modelTimetoResGDID, type='HC3'))

modelTimetoResGDIDEM <- emmeans(modelTimetoResGDID, ~MayoRes*TreatmentAU1634,
                                vcov = sandwich::vcovHC(modelTimetoResGDID,type='HC3'))

em<-as.data.frame(modelTimetoResGDIDEM)
em2<-em %>%
  select(MayoRes,TreatmentAU1634,emmean) %>%
  pivot_wider(names_from=MayoRes,values_from=emmean) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))



cat_plot(modelTimetoResGDID, pred = MayoRes, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Pendency to Grant",
       x="Mayo Timing of Resolution Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
theme(text=element_text(size=14))

#Now we need to get an effect size measure
#We will use Jake Westfall's formula as lsited at:
# http://jakewestfall.org/blog/index.php/2015/05/27/follow-up-what-about-uris-2n-rule/#comment-24

#First we obtain the RMSE
RSS <- c(crossprod(modelTimetoResGDID$residuals))
MSE <- RSS / length(modelTimetoResGDID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
TimetoResData2 <- TimetoResData %>%
  filter(!is.na(MayoRes)) %>%
  group_by(TreatmentAU1634,MayoRes) %>%
  summarize(GrantP=mean(GrantP, na.rm=TRUE))

A1<-TimetoResData2$GrantP[1]
A2<-TimetoResData2$GrantP[2]
B1<-TimetoResData2$GrantP[3]
B2<-TimetoResData2$GrantP[4]

#Then we can calculate Cohen's d
#Effect size of 0.47
d=((A1-B1) - (A2-B2))/(2*RMSE)

#Actual analysis for abandoned
modelTimetoResADID<- lm(AbandonP~ MayoRes*TreatmentAU1634 + as.Date(ResolutionDate), 
                        data=TimetoResData)
summary(modelTimetoResADID)
coeftest(modelTimetoResADID, vcov. = vcovHC(modelTimetoResADID, type='HC3'))

modelTimetoResADIDEM <- emmeans(modelTimetoResADID, ~MayoRes*TreatmentAU1634,
                                vcov = sandwich::vcovHC(modelTimetoResADID,type='HC3'))

em<-as.data.frame(modelTimetoResADIDEM)
em2<-em %>%
  select(MayoRes,TreatmentAU1634,emmean) %>%
  pivot_wider(names_from=MayoRes,values_from=emmean) %>%
  mutate(TreatmentAU1634 = ifelse(TreatmentAU1634==0,"Control","Treatment"))
write_clip(unname(em2[1:2,2:3]))

cat_plot(modelTimetoResADID, pred = MayoRes, modx = TreatmentAU1634, geom = "line",
         interval = FALSE,
         colors=c("black", "blue"),
         modx.labels=c("Control","Treatment"),
         legend.main="Condition:")  +
  labs(y="Adjusted Pendency to Abandonment",
       x="Mayo Timing of Resolution Date")+
  theme_classic()+
  theme(text=element_text(size=14)) +
  theme(axis.title.y=element_text(margin=margin(0,15,0,0))) +
  theme(axis.title.x=element_text(margin=margin(15,0,0,0))) +
  theme(plot.margin = margin(0.5,1,0.5,0.5, "cm")) +
theme(text=element_text(size=14))

#Now we need to get an effect size measure
#We will use Jake Westfall's formula as lsited at:
# http://jakewestfall.org/blog/index.php/2015/05/27/follow-up-what-about-uris-2n-rule/#comment-24

#First we obtain the RMSE
RSS <- c(crossprod(modelTimetoResADID$residuals))
MSE <- RSS / length(modelTimetoResADID$residuals)
RMSE <- sqrt(MSE)

#Now we need cell means
TimetoResData2 <- TimetoResData %>%
  filter(!is.na(MayoRes)) %>%
  group_by(TreatmentAU1634,MayoRes) %>%
  summarize(GrantP=mean(GrantP, na.rm=TRUE))

A1<-TimetoResData2$GrantP[1]
A2<-TimetoResData2$GrantP[2]
B1<-TimetoResData2$GrantP[3]
B2<-TimetoResData2$GrantP[4]

#Then we can calculate Cohen's d
#Effect size of 0.25
d=((A1-B1) - (A2-B2))/(2*RMSE)

##### AU1634 H6 - APP CLAIM1 WORD COUNTS ---------------------------

PubWordData <- df2 %>%
  filter(Published=="Y" & !is.na(TreatmentAU1634) &
           FileDate > StartDate & FileDate < EndDate &
           PubClaim1TotalWords < 3000) %>%
  select(PubClaim1TotalWords,PubClaim1UniqueWords,PubWordCount,
         FileDate,MayoFile,MayoFileIm,TreatmentAU1634,Continuation) %>%
  group_by(FileDate,TreatmentAU1634,Continuation,MayoFile) %>%
  summarize(Total=mean(PubClaim1TotalWords, na.rm=TRUE),
            Unique=mean(PubClaim1UniqueWords, na.rm=TRUE),
            WordC=mean(PubWordCount, na.rm=TRUE)) %>%
  pivot_wider(names_from=Continuation, values_from=c(Total,Unique,WordC)) %>%
  rowwise() %>%
  mutate(Total_Total=mean(c(Total_Continuation,`Total_New Patent`),na.rm=TRUE),
         Unique_Total=mean(c(Unique_Continuation,`Unique_New Patent`),na.rm=TRUE),
         WordC_Total=mean(c(WordC_Continuation,`WordC_New Patent`),na.rm=TRUE)) %>%
  pivot_longer(-c(FileDate,TreatmentAU1634,MayoFile),
               names_to=c("Variable","Status"),
               names_sep="_",
               values_to="number")  %>%
  ungroup()


#Parallel trends analysis
PubWordTPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="Total")

PubWordUPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="Unique")

PubWordWCPT <- PubWordData %>%
  filter(MayoFile=="Before" & Variable=="WordC")

#Actual analyses - all are parallel?

#Total Words
modelPubWordTPT<- lm(number ~ FileDate*TreatmentAU1634*Status, 
                     data=PubWordTPT)
summary(modelPubWordTPT)
coeftest(modelPubWordTPT, vcov. = vcovHC(modelPubWordTPT, type='HC3'))

#Unique Words
modelPubWordUPT<- lm(number ~ FileDate*TreatmentAU1634*Status, 
                     data=PubWordUPT)
summary(modelPubWordUPT)
coeftest(modelPubWordUPT, vcov. = vcovHC(modelPubWordUPT, type='HC3'))

#DiD Analysis

#NEw datasets
PubWordT <- PubWordData %>%
  filter(Variable=="Total")

PubWordU <- PubWordData %>%
  filter(Variable=="Unique")


#DID for Total - nope
modelPubWordTDID<- lm(number ~ MayoFile*TreatmentAU1634*Status + as.Date(FileDate), 
                      data=PubWordT)
summary(modelPubWordTDID)

coeftest(modelPubWordTDID, vcov. = vcovHC(modelPubWordTDID, type='HC3'))

modelPubWordTDIDEM <- emmeans(modelPubWordTDID, ~MayoFile*TreatmentAU1634,
                              vcov = sandwich::vcovHC(modelPubWordTDID,type='HC3'))

#DID for Unique - nope!
modelPubWordUDID<- lm(number ~ MayoFile*TreatmentAU1634*Status + as.Date(FileDate), 
                      data=PubWordU)
summary(modelPubWordUDID)

coeftest(modelPubWordUDID, vcov. = vcovHC(modelPubWordUDID, type='HC3'))


